73#ifdef EXPENSIVE_CHECKS
105using namespace slpvectorizer;
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
110STATISTIC(NumVectorInstructions,
"Number of vector instructions generated");
114 cl::desc(
"Run the SLP vectorization passes"));
118 cl::desc(
"Enable vectorization for wider vector utilization"));
122 cl::desc(
"Only vectorize if you gain more than this "
127 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
132 cl::desc(
"Attempt to vectorize horizontal reductions"));
137 "Attempt to vectorize horizontal reductions feeding into a store"));
141 cl::desc(
"Attempt to vectorize for this register size in bits"));
145 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
153 cl::desc(
"Limit the size of the SLP scheduling region per block"));
157 cl::desc(
"Attempt to vectorize for this register size in bits"));
161 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
165 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
171 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
180 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
184 cl::desc(
"The minimum number of loads, which should be considered strided, "
185 "if the stride is > 1 or is runtime value"));
189 cl::desc(
"The maximum stride, considered to be profitable."));
193 cl::desc(
"Display the SLP trees with Graphviz"));
197 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
229 Ty = Ty->getScalarType();
230 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
231 !Ty->isPPC_FP128Ty();
240 return SI->getValueOperand()->getType();
242 return CI->getOperand(0)->getType();
244 return IE->getOperand(1)->getType();
251 "ScalableVectorType is not supported.");
253 return VecTy->getNumElements();
267 Type *Ty,
unsigned Sz) {
272 if (NumParts == 0 || NumParts >= Sz)
286 I * VecTyNumElements, VecTyNumElements)))
288 : Mask[
I] * VecTyNumElements + J;
322 unsigned SVNumElements =
324 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
325 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
326 if (GroupSize == 0 || (VL.
size() % GroupSize) != 0)
328 unsigned NumGroup = 0;
329 for (
size_t I = 0, E = VL.
size();
I != E;
I += GroupSize) {
331 Value *Src = SV->getOperand(0);
337 if (SV->getOperand(0) != Src)
340 if (!SV->isExtractSubvectorMask(
Index))
342 ExpectedIndex.
set(
Index / ShuffleMaskSize);
346 if (!ExpectedIndex.
all())
350 assert(NumGroup == (VL.
size() / GroupSize) &&
"Unexpected number of groups");
369 unsigned SVNumElements =
372 unsigned AccumulateLength = 0;
373 for (
Value *V : VL) {
375 for (
int M : SV->getShuffleMask())
377 : AccumulateLength + M);
378 AccumulateLength += SVNumElements;
419 return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
428 OS <<
"Idx: " << Idx <<
", ";
429 OS <<
"n=" << VL.
size() <<
" [" << *VL.
front() <<
", ..]";
444 for (
int I = 1, E = VL.
size();
I < E;
I++) {
449 if (BB !=
II->getParent())
466 Value *FirstNonUndef =
nullptr;
467 for (
Value *V : VL) {
470 if (!FirstNonUndef) {
474 if (V != FirstNonUndef)
477 return FirstNonUndef !=
nullptr;
483 return Cmp->isCommutative();
485 return BO->isCommutative() ||
486 (BO->getOpcode() == Instruction::Sub &&
492 ICmpInst::Predicate Pred;
493 if (match(U.getUser(),
494 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
495 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
499 return match(U.getUser(),
500 m_Intrinsic<Intrinsic::abs>(
501 m_Specific(U.get()), m_ConstantInt(Flag))) &&
502 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
505 (BO->getOpcode() == Instruction::FSub &&
508 return match(U.getUser(),
509 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
511 return I->isCommutative();
517 static_assert(std::is_same_v<T, InsertElementInst> ||
518 std::is_same_v<T, ExtractElementInst>,
528 if (CI->getValue().uge(VT->getNumElements()))
530 Index *= VT->getNumElements();
531 Index += CI->getZExtValue();
553 Type *CurrentType =
IV->getType();
554 for (
unsigned I :
IV->indices()) {
556 Index *= ST->getNumElements();
557 CurrentType = ST->getElementType(
I);
559 Index *= AT->getNumElements();
560 CurrentType = AT->getElementType();
593 if (MaskArg == UseMask::UndefsAsMask)
597 if (MaskArg == UseMask::FirstArg &&
Value < VF)
598 UseMask.reset(
Value);
599 else if (MaskArg == UseMask::SecondArg &&
Value >= VF)
600 UseMask.reset(
Value - VF);
608template <
bool IsPoisonOnly = false>
612 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
620 if (!UseMask.empty()) {
631 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
646 for (
unsigned I = 0, E = VecTy->getNumElements();
I != E; ++
I) {
649 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
677static std::optional<TargetTransformInfo::ShuffleKind>
683 std::accumulate(VL.
begin(), VL.
end(), 0u, [](
unsigned S,
Value *V) {
684 auto *EI = dyn_cast<ExtractElementInst>(V);
687 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
690 return std::max(S, VTy->getNumElements());
693 Value *Vec1 =
nullptr;
694 Value *Vec2 =
nullptr;
699 Value *Vec = EE->getVectorOperand();
705 ShuffleMode CommonShuffleMode =
Unknown;
707 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
714 auto *Vec = EI->getVectorOperand();
728 if (Idx->getValue().uge(
Size))
730 unsigned IntIdx = Idx->getValue().getZExtValue();
737 if (!Vec1 || Vec1 == Vec) {
739 }
else if (!Vec2 || Vec2 == Vec) {
745 if (CommonShuffleMode == Permute)
749 if (Mask[
I] %
Size !=
I) {
750 CommonShuffleMode = Permute;
753 CommonShuffleMode =
Select;
756 if (CommonShuffleMode ==
Select && Vec2)
766 unsigned Opcode = E->getOpcode();
767 assert((Opcode == Instruction::ExtractElement ||
768 Opcode == Instruction::ExtractValue) &&
769 "Expected extractelement or extractvalue instruction.");
770 if (Opcode == Instruction::ExtractElement) {
774 return CI->getZExtValue();
777 if (EI->getNumIndices() != 1)
779 return *EI->idx_begin();
785struct InstructionsState {
787 Value *OpValue =
nullptr;
798 unsigned getAltOpcode()
const {
803 bool isAltShuffle()
const {
return AltOp != MainOp; }
807 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
810 InstructionsState() =
delete;
812 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
831 unsigned BaseIndex = 0);
841 BaseOp0 == Op0 || BaseOp1 == Op1 ||
852 "Assessing comparisons of different types?");
862 return (BasePred == Pred &&
864 (BasePred == SwappedPred &&
873 unsigned BaseIndex) {
876 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
885 unsigned AltOpcode = Opcode;
886 unsigned AltIndex = BaseIndex;
888 bool SwappedPredsCompatible = [&]() {
892 UniquePreds.
insert(BasePred);
893 UniqueNonSwappedPreds.
insert(BasePred);
894 for (
Value *V : VL) {
901 UniqueNonSwappedPreds.
insert(CurrentPred);
902 if (!UniquePreds.
contains(CurrentPred) &&
903 !UniquePreds.
contains(SwappedCurrentPred))
904 UniquePreds.
insert(CurrentPred);
909 return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
920 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
922 for (
int Cnt = 0, E = VL.
size(); Cnt < E; Cnt++) {
924 unsigned InstOpcode =
I->getOpcode();
926 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
930 AltOpcode = InstOpcode;
935 Value *Op0 = IBase->getOperand(0);
937 Value *Op1 =
I->getOperand(0);
940 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
942 if (Opcode == AltOpcode) {
945 "Cast isn't safe for alternation, logic needs to be updated!");
946 AltOpcode = InstOpcode;
953 Type *Ty0 = BaseInst->getOperand(0)->getType();
954 Type *Ty1 = Inst->getOperand(0)->getType();
956 assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
963 if ((E == 2 || SwappedPredsCompatible) &&
964 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
970 if (AltIndex != BaseIndex) {
973 }
else if (BasePred != CurrentPred) {
976 "CmpInst isn't safe for alternation, logic needs to be updated!");
981 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
982 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
985 }
else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
987 if (Gep->getNumOperands() != 2 ||
988 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
989 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
992 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
995 if (!LI->isSimple() || !BaseLI->isSimple())
996 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
1000 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
1006 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
1009 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
1012 if (Mappings.
size() != BaseMappings.
size() ||
1013 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1014 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1015 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1016 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1017 Mappings.
front().Shape.Parameters !=
1018 BaseMappings.
front().Shape.Parameters)
1019 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
1024 return InstructionsState(VL[BaseIndex],
nullptr,
nullptr);
1044 unsigned Opcode = UserInst->
getOpcode();
1046 case Instruction::Load: {
1050 case Instruction::Store: {
1052 return (SI->getPointerOperand() == Scalar);
1054 case Instruction::Call: {
1058 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
1059 Arg.value().get() == Scalar;
1079 return LI->isSimple();
1081 return SI->isSimple();
1083 return !
MI->isVolatile();
1091 bool ExtendingManyInputs =
false) {
1092 if (SubMask.
empty())
1095 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1097 (SubMask.
size() == Mask.size() &&
1098 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
1099 [](
int Idx) { return Idx == PoisonMaskElem; }))) &&
1100 "SubMask with many inputs support must be larger than the mask.");
1102 Mask.append(SubMask.
begin(), SubMask.
end());
1106 int TermValue = std::min(Mask.size(), SubMask.
size());
1107 for (
int I = 0, E = SubMask.
size();
I < E; ++
I) {
1109 (!ExtendingManyInputs &&
1110 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1112 NewMask[
I] = Mask[SubMask[
I]];
1128 const unsigned Sz = Order.
size();
1131 for (
unsigned I = 0;
I < Sz; ++
I) {
1133 UnusedIndices.
reset(Order[
I]);
1135 MaskedIndices.
set(
I);
1137 if (MaskedIndices.
none())
1140 "Non-synced masked/available indices.");
1144 assert(Idx >= 0 &&
"Indices must be synced.");
1155 Type *ScalarTy = VL[0]->getType();
1160 OpcodeMask.
set(Lane * ScalarTyNumElements,
1161 Lane * ScalarTyNumElements + ScalarTyNumElements);
1170 const unsigned E = Indices.
size();
1172 for (
unsigned I = 0;
I <
E; ++
I)
1173 Mask[Indices[
I]] =
I;
1179 assert(!Mask.empty() &&
"Expected non-empty mask.");
1183 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
1185 Scalars[Mask[
I]] = Prev[
I];
1198 auto *IO = dyn_cast<Instruction>(V);
1201 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1214 return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1216 auto *IU = dyn_cast<Instruction>(U);
1219 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1235 return !VL.
empty() &&
1251 return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1255namespace slpvectorizer {
1260 struct ScheduleData;
1285 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1286 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1337 return !VectorizableTree.
empty() &&
1338 !VectorizableTree.
front()->UserTreeIndices.empty();
1343 assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1344 return VectorizableTree.
front()->Scalars;
1350 return MinBWs.at(VectorizableTree.
front().get()).second;
1365 VectorizableTree.
clear();
1366 ScalarToTreeEntry.clear();
1367 MultiNodeScalars.clear();
1369 NonScheduledFirst.
clear();
1370 EntryToLastInstruction.clear();
1371 GatheredLoadsEntriesFirst = NoGatheredLoads;
1372 ExternalUses.
clear();
1373 ExternalUsesAsOriginalScalar.clear();
1374 for (
auto &Iter : BlocksSchedules) {
1375 BlockScheduling *BS = Iter.second.get();
1379 ReductionBitWidth = 0;
1381 CastMaxMinBWSizes.reset();
1382 ExtraBitWidthNodes.
clear();
1383 InstrElementSize.clear();
1384 UserIgnoreList =
nullptr;
1385 PostponedGathers.
clear();
1386 ValueToGatherNodes.clear();
1402 assert(!Order.
empty() &&
"expected non-empty order");
1403 const unsigned Sz = Order.
size();
1405 return P.value() ==
P.index() ||
P.value() == Sz;
1458 return MaxVecRegSize;
1463 return MinVecRegSize;
1471 unsigned MaxVF =
MaxVFOption.getNumOccurrences() ?
1473 return MaxVF ? MaxVF : UINT_MAX;
1519 unsigned *BestVF =
nullptr,
1520 bool TryRecursiveCheck =
true)
const;
1528 template <
typename T>
1555 OS <<
"{User:" << (
UserTE ? std::to_string(
UserTE->Idx) :
"null")
1556 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1578 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1579 MaxLevel(MaxLevel) {}
1635 auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1640 auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1642 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1645 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1648 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1650 ((
int)V1->getNumUses() == NumLanes ||
1651 AllUsersAreInternal(V1, V2)))
1657 auto CheckSameEntryOrFail = [&]() {
1658 if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1659 TE1 && TE1 == R.getTreeEntry(V2))
1667 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1669 return CheckSameEntryOrFail();
1672 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1673 LI2->getPointerOperand(),
DL, SE,
true);
1674 if (!Dist || *Dist == 0) {
1677 R.TTI->isLegalMaskedGather(
1680 return CheckSameEntryOrFail();
1684 if (std::abs(*Dist) > NumLanes / 2)
1712 Value *EV2 =
nullptr;
1725 int Dist = Idx2 - Idx1;
1728 if (std::abs(Dist) == 0)
1730 if (std::abs(Dist) > NumLanes / 2)
1737 return CheckSameEntryOrFail();
1743 if (I1->getParent() != I2->getParent())
1744 return CheckSameEntryOrFail();
1751 if (S.getOpcode() &&
1753 !S.isAltShuffle()) &&
1765 return CheckSameEntryOrFail();
1799 int ShallowScoreAtThisLevel =
1810 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1813 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1815 ShallowScoreAtThisLevel))
1816 return ShallowScoreAtThisLevel;
1817 assert(I1 && I2 &&
"Should have early exited.");
1824 for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1825 OpIdx1 != NumOperands1; ++OpIdx1) {
1827 int MaxTmpScore = 0;
1828 unsigned MaxOpIdx2 = 0;
1829 bool FoundBest =
false;
1833 ? I2->getNumOperands()
1834 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1835 assert(FromIdx <= ToIdx &&
"Bad index");
1836 for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1838 if (Op2Used.
count(OpIdx2))
1843 I1, I2, CurrLevel + 1, {});
1846 TmpScore > MaxTmpScore) {
1847 MaxTmpScore = TmpScore;
1854 Op2Used.
insert(MaxOpIdx2);
1855 ShallowScoreAtThisLevel += MaxTmpScore;
1858 return ShallowScoreAtThisLevel;
1889 struct OperandData {
1890 OperandData() =
default;
1891 OperandData(
Value *V,
bool APO,
bool IsUsed)
1892 : V(V), APO(APO), IsUsed(IsUsed) {}
1902 bool IsUsed =
false;
1911 enum class ReorderingMode {
1928 const Loop *L =
nullptr;
1931 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
1932 return OpsVec[OpIdx][Lane];
1936 const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const {
1937 return OpsVec[OpIdx][Lane];
1942 for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
1943 OpIdx != NumOperands; ++OpIdx)
1944 for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1946 OpsVec[OpIdx][Lane].IsUsed =
false;
1950 void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
1951 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1963 int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx,
1965 Value *IdxLaneV = getData(Idx, Lane).V;
1973 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1978 unsigned UniquesCount = Uniques.
size();
1979 auto IdxIt = Uniques.
find(IdxLaneV);
1980 unsigned UniquesCntWithIdxLaneV =
1981 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
1982 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1983 auto OpIdxIt = Uniques.
find(OpIdxLaneV);
1984 unsigned UniquesCntWithOpIdxLaneV =
1985 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
1986 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1988 return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
1989 UniquesCntWithOpIdxLaneV,
1990 UniquesCntWithOpIdxLaneV -
1992 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
1993 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
1994 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2003 int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsigned Idx)
const {
2004 Value *IdxLaneV = getData(Idx, Lane).V;
2005 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2017 return R.areAllUsersVectorized(IdxLaneI)
2025 static const int ScoreScaleFactor = 10;
2033 int Lane,
unsigned OpIdx,
unsigned Idx,
2043 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2044 if (Score <= -SplatScore) {
2048 Score += SplatScore;
2054 Score *= ScoreScaleFactor;
2055 Score += getExternalUseScore(Lane, OpIdx, Idx);
2073 std::optional<unsigned>
2074 getBestOperand(
unsigned OpIdx,
int Lane,
int LastLane,
2078 unsigned NumOperands = getNumOperands();
2081 Value *OpLastLane = getData(OpIdx, LastLane).V;
2084 ReorderingMode RMode = ReorderingModes[OpIdx];
2085 if (RMode == ReorderingMode::Failed)
2086 return std::nullopt;
2089 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2095 std::optional<unsigned> Idx;
2099 BestScoresPerLanes.
try_emplace(std::make_pair(OpIdx, Lane), 0)
2105 bool IsUsed = RMode == ReorderingMode::Splat ||
2106 RMode == ReorderingMode::Constant ||
2107 RMode == ReorderingMode::Load;
2109 for (
unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2111 OperandData &OpData = getData(Idx, Lane);
2113 bool OpAPO = OpData.APO;
2122 if (OpAPO != OpIdxAPO)
2127 case ReorderingMode::Load:
2128 case ReorderingMode::Opcode: {
2129 bool LeftToRight = Lane > LastLane;
2130 Value *OpLeft = (LeftToRight) ? OpLastLane :
Op;
2131 Value *OpRight = (LeftToRight) ?
Op : OpLastLane;
2132 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2133 OpIdx, Idx, IsUsed, UsedLanes);
2134 if (Score >
static_cast<int>(BestOp.Score) ||
2135 (Score > 0 && Score ==
static_cast<int>(BestOp.Score) &&
2138 BestOp.Score = Score;
2139 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2143 case ReorderingMode::Constant:
2145 (!BestOp.Score && L && L->isLoopInvariant(
Op))) {
2149 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2156 case ReorderingMode::Splat:
2158 IsUsed =
Op == OpLastLane;
2159 if (
Op == OpLastLane) {
2161 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2167 case ReorderingMode::Failed:
2173 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2177 return std::nullopt;
2184 unsigned getBestLaneToStartReordering()
const {
2185 unsigned Min = UINT_MAX;
2186 unsigned SameOpNumber = 0;
2197 for (
int I = getNumLanes();
I > 0; --
I) {
2198 unsigned Lane =
I - 1;
2199 OperandsOrderData NumFreeOpsHash =
2200 getMaxNumOperandsThatCanBeReordered(Lane);
2203 if (NumFreeOpsHash.NumOfAPOs < Min) {
2204 Min = NumFreeOpsHash.NumOfAPOs;
2205 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2207 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2208 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
2209 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2212 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2213 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2214 }
else if (NumFreeOpsHash.NumOfAPOs == Min &&
2215 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2216 auto *It = HashMap.
find(NumFreeOpsHash.Hash);
2217 if (It == HashMap.
end())
2218 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2224 unsigned BestLane = 0;
2225 unsigned CntMin = UINT_MAX;
2227 if (
Data.second.first < CntMin) {
2228 CntMin =
Data.second.first;
2229 BestLane =
Data.second.second;
2236 struct OperandsOrderData {
2239 unsigned NumOfAPOs = UINT_MAX;
2242 unsigned NumOpsWithSameOpcodeParent = 0;
2256 OperandsOrderData getMaxNumOperandsThatCanBeReordered(
unsigned Lane)
const {
2257 unsigned CntTrue = 0;
2258 unsigned NumOperands = getNumOperands();
2268 bool AllUndefs =
true;
2269 unsigned NumOpsWithSameOpcodeParent = 0;
2273 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2274 const OperandData &OpData = getData(OpIdx, Lane);
2281 I->getParent() != Parent) {
2282 if (NumOpsWithSameOpcodeParent == 0) {
2283 NumOpsWithSameOpcodeParent = 1;
2287 --NumOpsWithSameOpcodeParent;
2290 ++NumOpsWithSameOpcodeParent;
2299 OperandsOrderData
Data;
2300 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2301 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2309 assert((empty() || VL.
size() == getNumLanes()) &&
2310 "Expected same number of lanes");
2313 constexpr unsigned IntrinsicNumOperands = 2;
2315 NumOperands = IntrinsicNumOperands;
2316 OpsVec.
resize(NumOperands);
2317 unsigned NumLanes = VL.
size();
2318 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2319 OpsVec[OpIdx].
resize(NumLanes);
2320 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2333 bool APO = (OpIdx == 0) ?
false : IsInverseOperation;
2341 unsigned getNumOperands()
const {
return OpsVec.
size(); }
2344 unsigned getNumLanes()
const {
return OpsVec[0].
size(); }
2347 Value *getValue(
unsigned OpIdx,
unsigned Lane)
const {
2348 return getData(OpIdx, Lane).V;
2352 bool empty()
const {
return OpsVec.
empty(); }
2355 void clear() { OpsVec.
clear(); }
2360 bool shouldBroadcast(
Value *
Op,
unsigned OpIdx,
unsigned Lane) {
2361 bool OpAPO = getData(OpIdx, Lane).APO;
2362 bool IsInvariant = L && L->isLoopInvariant(
Op);
2364 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2368 bool FoundCandidate =
false;
2369 for (
unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2370 OperandData &
Data = getData(OpI, Ln);
2371 if (
Data.APO != OpAPO ||
Data.IsUsed)
2373 Value *OpILane = getValue(OpI, Lane);
2398 L->isLoopInvariant(
Data.V))) {
2399 FoundCandidate =
true;
2406 if (!FoundCandidate)
2409 return getNumLanes() == 2 || Cnt > 1;
2414 bool canBeVectorized(
Instruction *
Op,
unsigned OpIdx,
unsigned Lane)
const {
2415 bool OpAPO = getData(OpIdx, Lane).APO;
2416 for (
unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2420 const OperandData &
Data = getData(OpI, Ln);
2421 if (
Data.APO != OpAPO ||
Data.IsUsed)
2423 Value *OpILn = getValue(OpI, Ln);
2424 return (L && L->isLoopInvariant(OpILn)) ||
2436 : TLI(*R.TLI),
DL(*R.
DL), SE(*R.SE), R(R),
2440 appendOperandsOfVL(RootVL);
2447 assert(OpsVec[OpIdx].
size() == getNumLanes() &&
2448 "Expected same num of lanes across all operands");
2449 for (
unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2450 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2458 unsigned NumOperands = getNumOperands();
2459 unsigned NumLanes = getNumLanes();
2479 unsigned FirstLane = getBestLaneToStartReordering();
2482 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2483 Value *OpLane0 = getValue(OpIdx, FirstLane);
2487 ReorderingModes[OpIdx] = ReorderingMode::Load;
2490 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2491 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2492 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2494 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2496 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2499 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2502 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2509 auto &&SkipReordering = [
this]() {
2512 for (
const OperandData &
Data : Op0)
2515 if (
any_of(
Op, [&UniqueValues](
const OperandData &
Data) {
2534 if (SkipReordering())
2537 bool StrategyFailed =
false;
2545 for (
unsigned I = 0;
I < NumOperands; ++
I)
2546 MainAltOps[
I].push_back(getData(
I, FirstLane).V);
2549 UsedLanes.
set(FirstLane);
2550 for (
unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2553 int Lane = FirstLane +
Direction * Distance;
2554 if (Lane < 0 || Lane >= (
int)NumLanes)
2556 UsedLanes.
set(Lane);
2558 assert(LastLane >= 0 && LastLane < (
int)NumLanes &&
2561 for (
unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2563 std::optional<unsigned> BestIdx =
2564 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2565 MainAltOps[OpIdx], UsedLanes);
2572 swap(OpIdx, *BestIdx, Lane);
2575 StrategyFailed =
true;
2578 if (MainAltOps[OpIdx].
size() != 2) {
2579 OperandData &AltOp = getData(OpIdx, Lane);
2580 InstructionsState OpS =
2582 if (OpS.getOpcode() && OpS.isAltShuffle())
2589 if (!StrategyFailed)
2594#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2597 case ReorderingMode::Load:
2599 case ReorderingMode::Opcode:
2601 case ReorderingMode::Constant:
2603 case ReorderingMode::Splat:
2605 case ReorderingMode::Failed:
2626 const unsigned Indent = 2;
2629 OS <<
"Operand " << Cnt++ <<
"\n";
2630 for (
const OperandData &OpData : OpDataVec) {
2632 if (
Value *V = OpData.V)
2636 OS <<
", APO:" << OpData.APO <<
"}\n";
2658 int BestScore = Limit;
2659 std::optional<int>
Index;
2660 for (
int I :
seq<int>(0, Candidates.size())) {
2662 Candidates[
I].second,
2665 if (Score > BestScore) {
2680 DeletedInstructions.insert(
I);
2685 template <
typename T>
2688 for (
T *V : DeadVals) {
2690 DeletedInstructions.insert(
I);
2693 for (
T *V : DeadVals) {
2694 if (!V || !Processed.
insert(V).second)
2699 if (
const TreeEntry *Entry = getTreeEntry(
I)) {
2700 Entries.push_back(Entry);
2701 auto It = MultiNodeScalars.find(
I);
2702 if (It != MultiNodeScalars.end())
2703 Entries.append(It->second.begin(), It->second.end());
2705 for (
Use &U :
I->operands()) {
2707 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2709 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
2710 return Entry->VectorizedValue == OpI;
2714 I->dropAllReferences();
2716 for (
T *V : DeadVals) {
2718 if (!
I->getParent())
2723 cast<Instruction>(U.getUser()));
2725 "trying to erase instruction with users.");
2726 I->removeFromParent();
2730 while (!DeadInsts.
empty()) {
2733 if (!VI || !VI->getParent())
2736 "Live instruction found in dead worklist!");
2737 assert(VI->use_empty() &&
"Instructions with uses are not dead.");
2744 for (
Use &OpU : VI->operands()) {
2745 Value *OpV = OpU.get();
2757 if (!DeletedInstructions.contains(OpI) &&
2762 VI->removeFromParent();
2763 DeletedInstructions.insert(VI);
2771 return AnalyzedReductionsRoots.count(
I);
2776 AnalyzedReductionsRoots.insert(
I);
2790 AnalyzedReductionsRoots.clear();
2791 AnalyzedReductionVals.
clear();
2792 AnalyzedMinBWVals.
clear();
2804 return NonScheduledFirst.
contains(V);
2817 bool collectValuesToDemote(
const TreeEntry &E,
bool IsProfitableToDemoteRoot,
2821 unsigned &MaxDepthLevel,
2822 bool &IsProfitableToDemote,
2823 bool IsTruncRoot)
const;
2833 canReorderOperands(TreeEntry *UserTE,
2840 void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2844 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
2846 TreeEntry *TE =
nullptr;
2848 TE = getTreeEntry(V);
2849 if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2851 auto It = MultiNodeScalars.find(V);
2852 if (It != MultiNodeScalars.end()) {
2853 for (TreeEntry *E : It->second) {
2854 if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2862 if (It != VL.
end()) {
2863 assert(
TE->isSame(VL) &&
"Expected same scalars.");
2871 const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
2872 unsigned OpIdx)
const {
2873 return const_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
2874 const_cast<TreeEntry *
>(UserTE), OpIdx);
2878 bool areAllUsersVectorized(
2887 const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsigned Idx)
const;
2892 Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
2896 getCastContextHint(
const TreeEntry &TE)
const;
2905 const EdgeInfo &EI);
2916 bool ResizeAllowed =
false)
const;
2925 TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
unsigned NodeIdx);
2926 const TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
2927 unsigned NodeIdx)
const {
2928 return const_cast<BoUpSLP *
>(
this)->getMatchedVectorizedOperand(E, NodeIdx);
2935 Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
2940 template <
typename BVTy,
typename ResTy,
typename...
Args>
2941 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
2946 Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
2947 bool PostponedPHIs);
2953 Instruction &getLastInstructionInBundle(
const TreeEntry *E);
2960 std::optional<TargetTransformInfo::ShuffleKind>
2972 unsigned NumParts)
const;
2984 std::optional<TargetTransformInfo::ShuffleKind>
2985 isGatherShuffledSingleRegisterEntry(
3002 isGatherShuffledEntry(
3005 unsigned NumParts,
bool ForOrder =
false);
3012 Type *ScalarTy)
const;
3016 void setInsertPointAfterBundle(
const TreeEntry *E);
3024 bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3029 void tryToVectorizeGatheredLoads(
3043 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3059 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3063 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3080 [Scalars](
Value *V,
int Idx) {
3081 return (isa<UndefValue>(V) &&
3082 Idx == PoisonMaskElem) ||
3083 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3086 if (!ReorderIndices.
empty()) {
3093 return IsSame(Scalars, Mask);
3094 if (VL.
size() == ReuseShuffleIndices.
size()) {
3096 return IsSame(Scalars, Mask);
3100 return IsSame(Scalars, ReuseShuffleIndices);
3103 bool isOperandGatherNode(
const EdgeInfo &UserEI)
const {
3104 return isGather() && !UserTreeIndices.empty() &&
3105 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3106 UserTreeIndices.front().UserTE == UserEI.UserTE;
3110 bool hasEqualOperands(
const TreeEntry &TE)
const {
3111 if (
TE.getNumOperands() != getNumOperands())
3114 for (
unsigned I = 0, E = getNumOperands();
I <
E; ++
I) {
3115 unsigned PrevCount =
Used.count();
3116 for (
unsigned K = 0; K <
E; ++K) {
3119 if (getOperand(K) ==
TE.getOperand(
I)) {
3125 if (PrevCount ==
Used.count())
3134 unsigned getVectorFactor()
const {
3135 if (!ReuseShuffleIndices.
empty())
3136 return ReuseShuffleIndices.
size();
3137 return Scalars.
size();
3141 bool isGather()
const {
return State == NeedToGather; }
3168 enum CombinedOpcode {
3170 MinMax = Instruction::OtherOpsEnd + 1,
3172 CombinedOpcode CombinedOp = NotCombinedOp;
3186 VecTreeTy &Container;
3214 assert(Operands[OpIdx].
empty() &&
"Already resized?");
3216 "Number of operands is greater than the number of scalars.");
3218 copy(OpVL, Operands[OpIdx].begin());
3222 void setOperandsInOrder() {
3225 Operands.resize(I0->getNumOperands());
3226 unsigned NumLanes = Scalars.size();
3227 for (
unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3228 OpIdx != NumOperands; ++OpIdx) {
3230 for (
unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3232 assert(
I->getNumOperands() == NumOperands &&
3233 "Expected same number of operands");
3234 Operands[OpIdx][Lane] =
I->getOperand(OpIdx);
3258 unsigned getNumOperands()
const {
return Operands.size(); }
3261 Value *getSingleOperand(
unsigned OpIdx)
const {
3263 assert(!Operands[OpIdx].
empty() &&
"No operand available");
3268 bool isAltShuffle()
const {
return MainOp != AltOp; }
3272 return (getOpcode() == CheckedOpcode ||
3273 getAltOpcode() == CheckedOpcode);
3281 if (
I && isOpcodeOrAlt(
I))
3286 void setOperations(
const InstructionsState &S) {
3300 unsigned getOpcode()
const {
3301 return MainOp ? MainOp->
getOpcode() : 0;
3304 unsigned getAltOpcode()
const {
3310 int findLaneForValue(
Value *V)
const {
3311 unsigned FoundLane = getVectorFactor();
3312 for (
auto *It =
find(Scalars, V), *
End = Scalars.end(); It !=
End;
3313 std::advance(It, 1)) {
3316 FoundLane = std::distance(Scalars.begin(), It);
3317 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3318 if (!ReorderIndices.
empty())
3319 FoundLane = ReorderIndices[FoundLane];
3320 assert(FoundLane < Scalars.size() &&
"Couldn't find extract lane");
3321 if (ReuseShuffleIndices.
empty())
3323 if (
auto *RIt =
find(ReuseShuffleIndices, FoundLane);
3324 RIt != ReuseShuffleIndices.
end()) {
3325 FoundLane = std::distance(ReuseShuffleIndices.
begin(), RIt);
3329 assert(FoundLane < getVectorFactor() &&
"Unable to find given value.");
3342 bool isNonPowOf2Vec()
const {
3344 return IsNonPowerOf2;
3353 assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
3354 "Reshuffling not supported with non-power-of-2 vectors yet.");
3355 return IsNonPowerOf2;
3361 dbgs() << Idx <<
".\n";
3362 for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
3363 dbgs() <<
"Operand " << OpI <<
":\n";
3364 for (
const Value *V : Operands[OpI])
3367 dbgs() <<
"Scalars: \n";
3368 for (
Value *V : Scalars)
3370 dbgs() <<
"State: ";
3373 dbgs() <<
"Vectorize\n";
3375 case ScatterVectorize:
3376 dbgs() <<
"ScatterVectorize\n";
3378 case StridedVectorize:
3379 dbgs() <<
"StridedVectorize\n";
3382 dbgs() <<
"NeedToGather\n";
3384 case CombinedVectorize:
3385 dbgs() <<
"CombinedVectorize\n";
3388 dbgs() <<
"MainOp: ";
3390 dbgs() << *MainOp <<
"\n";
3393 dbgs() <<
"AltOp: ";
3395 dbgs() << *AltOp <<
"\n";
3398 dbgs() <<
"VectorizedValue: ";
3399 if (VectorizedValue)
3400 dbgs() << *VectorizedValue <<
"\n";
3403 dbgs() <<
"ReuseShuffleIndices: ";
3404 if (ReuseShuffleIndices.
empty())
3407 for (
int ReuseIdx : ReuseShuffleIndices)
3408 dbgs() << ReuseIdx <<
", ";
3410 dbgs() <<
"ReorderIndices: ";
3411 for (
unsigned ReorderIdx : ReorderIndices)
3412 dbgs() << ReorderIdx <<
", ";
3414 dbgs() <<
"UserTreeIndices: ";
3415 for (
const auto &EInfo : UserTreeIndices)
3416 dbgs() << EInfo <<
", ";
3423 void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3426 dbgs() <<
"SLP: " << Banner <<
":\n";
3428 dbgs() <<
"SLP: Costs:\n";
3429 dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3430 dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3431 dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3432 dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3433 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3439 std::optional<ScheduleData *> Bundle,
3440 const InstructionsState &S,
3441 const EdgeInfo &UserTreeIdx,
3444 TreeEntry::EntryState EntryState =
3445 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3446 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3447 ReuseShuffleIndices, ReorderIndices);
3451 TreeEntry::EntryState EntryState,
3452 std::optional<ScheduleData *> Bundle,
3453 const InstructionsState &S,
3454 const EdgeInfo &UserTreeIdx,
3457 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3458 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3459 "Need to vectorize gather entry?");
3461 if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
3462 EntryState == TreeEntry::NeedToGather &&
3463 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3464 !UserTreeIdx.UserTE)
3466 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3467 TreeEntry *
Last = VectorizableTree.
back().get();
3468 Last->Idx = VectorizableTree.
size() - 1;
3469 Last->State = EntryState;
3474 ReuseShuffleIndices.empty()) &&
3475 "Reshuffling scalars not yet supported for nodes with padding");
3476 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3477 ReuseShuffleIndices.end());
3478 if (ReorderIndices.
empty()) {
3480 Last->setOperations(S);
3483 Last->Scalars.assign(VL.
size(),
nullptr);
3485 [VL](
unsigned Idx) ->
Value * {
3486 if (Idx >= VL.size())
3487 return UndefValue::get(VL.front()->getType());
3491 Last->setOperations(S);
3492 Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3494 if (!
Last->isGather()) {
3495 for (
Value *V : VL) {
3496 const TreeEntry *
TE = getTreeEntry(V);
3498 "Scalar already in tree!");
3501 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3504 ScalarToTreeEntry[V] =
Last;
3507 ScheduleData *BundleMember = *Bundle;
3511 "Bundle and VL out of sync");
3513 for (
Value *V : VL) {
3518 BundleMember->TE =
Last;
3519 BundleMember = BundleMember->NextInBundle;
3522 assert(!BundleMember &&
"Bundle and VL out of sync");
3525 bool AllConstsOrCasts =
true;
3529 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3530 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3531 !UserTreeIdx.UserTE->isGather())
3532 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(
Last);
3534 if (AllConstsOrCasts)
3536 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3537 MustGather.
insert(VL.begin(), VL.end());
3540 if (UserTreeIdx.UserTE)
3541 Last->UserTreeIndices.push_back(UserTreeIdx);
3547 TreeEntry::VecTreeTy VectorizableTree;
3552 for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3553 VectorizableTree[
Id]->dump();
3559 TreeEntry *getTreeEntry(
Value *V) {
return ScalarToTreeEntry.lookup(V); }
3561 const TreeEntry *getTreeEntry(
Value *V)
const {
3562 return ScalarToTreeEntry.lookup(V);
3571 bool areAltOperandsProfitable(
const InstructionsState &S,
3576 TreeEntry::EntryState getScalarsVectorizationState(
3609 using ValueToGatherNodesMap =
3611 ValueToGatherNodesMap ValueToGatherNodes;
3614 constexpr static int NoGatheredLoads = -1;
3615 int GatheredLoadsEntriesFirst = NoGatheredLoads;
3618 struct ExternalUser {
3620 : Scalar(S),
User(U), Lane(
L) {}
3642 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3643 auto It = AliasCache.
find(Key);
3644 if (It != AliasCache.
end())
3649 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3653 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3685 UserList ExternalUses;
3708 struct ScheduleData {
3711 enum { InvalidDeps = -1 };
3713 ScheduleData() =
default;
3715 void init(
int BlockSchedulingRegionID,
Instruction *
I) {
3716 FirstInBundle =
this;
3717 NextInBundle =
nullptr;
3718 NextLoadStore =
nullptr;
3719 IsScheduled =
false;
3720 SchedulingRegionID = BlockSchedulingRegionID;
3721 clearDependencies();
3728 if (hasValidDependencies()) {
3729 assert(UnscheduledDeps <= Dependencies &&
"invariant");
3731 assert(UnscheduledDeps == Dependencies &&
"invariant");
3735 assert(isSchedulingEntity() &&
3736 "unexpected scheduled state");
3737 for (
const ScheduleData *BundleMember =
this; BundleMember;
3738 BundleMember = BundleMember->NextInBundle) {
3739 assert(BundleMember->hasValidDependencies() &&
3740 BundleMember->UnscheduledDeps == 0 &&
3741 "unexpected scheduled state");
3742 assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3743 "only bundle is marked scheduled");
3748 "all bundle members must be in same basic block");
3754 bool hasValidDependencies()
const {
return Dependencies != InvalidDeps; }
3758 bool isSchedulingEntity()
const {
return FirstInBundle ==
this; }
3762 bool isPartOfBundle()
const {
3763 return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3768 bool isReady()
const {
3769 assert(isSchedulingEntity() &&
3770 "can't consider non-scheduling entity for ready list");
3771 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3777 int incrementUnscheduledDeps(
int Incr) {
3778 assert(hasValidDependencies() &&
3779 "increment of unscheduled deps would be meaningless");
3780 UnscheduledDeps += Incr;
3781 return FirstInBundle->unscheduledDepsInBundle();
3786 void resetUnscheduledDeps() {
3787 UnscheduledDeps = Dependencies;
3791 void clearDependencies() {
3792 Dependencies = InvalidDeps;
3793 resetUnscheduledDeps();
3794 MemoryDependencies.
clear();
3795 ControlDependencies.
clear();
3798 int unscheduledDepsInBundle()
const {
3799 assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3801 for (
const ScheduleData *BundleMember =
this; BundleMember;
3802 BundleMember = BundleMember->NextInBundle) {
3803 if (BundleMember->UnscheduledDeps == InvalidDeps)
3805 Sum += BundleMember->UnscheduledDeps;
3811 if (!isSchedulingEntity()) {
3812 os <<
"/ " << *Inst;
3813 }
else if (NextInBundle) {
3815 ScheduleData *SD = NextInBundle;
3817 os <<
';' << *SD->Inst;
3818 SD = SD->NextInBundle;
3829 TreeEntry *
TE =
nullptr;
3833 ScheduleData *FirstInBundle =
nullptr;
3837 ScheduleData *NextInBundle =
nullptr;
3841 ScheduleData *NextLoadStore =
nullptr;
3855 int SchedulingRegionID = 0;
3858 int SchedulingPriority = 0;
3864 int Dependencies = InvalidDeps;
3870 int UnscheduledDeps = InvalidDeps;
3874 bool IsScheduled =
false;
3879 const BoUpSLP::ScheduleData &SD) {
3904 struct BlockScheduling {
3906 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3910 ScheduleStart =
nullptr;
3911 ScheduleEnd =
nullptr;
3912 FirstLoadStoreInRegion =
nullptr;
3913 LastLoadStoreInRegion =
nullptr;
3914 RegionHasStackSave =
false;
3918 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3921 ScheduleRegionSize = 0;
3925 ++SchedulingRegionID;
3929 if (BB !=
I->getParent())
3932 ScheduleData *SD = ScheduleDataMap.lookup(
I);
3933 if (SD && isInSchedulingRegion(SD))
3938 ScheduleData *getScheduleData(
Value *V) {
3940 return getScheduleData(
I);
3944 bool isInSchedulingRegion(ScheduleData *SD)
const {
3945 return SD->SchedulingRegionID == SchedulingRegionID;
3950 template <
typename ReadyListType>
3951 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3952 SD->IsScheduled =
true;
3955 for (ScheduleData *BundleMember = SD; BundleMember;
3956 BundleMember = BundleMember->NextInBundle) {
3961 auto &&DecrUnsched = [
this, &ReadyList](
Instruction *
I) {
3962 ScheduleData *OpDef = getScheduleData(
I);
3963 if (OpDef && OpDef->hasValidDependencies() &&
3964 OpDef->incrementUnscheduledDeps(-1) == 0) {
3968 ScheduleData *DepBundle = OpDef->FirstInBundle;
3969 assert(!DepBundle->IsScheduled &&
3970 "already scheduled bundle gets ready");
3971 ReadyList.insert(DepBundle);
3973 <<
"SLP: gets ready (def): " << *DepBundle <<
"\n");
3980 if (TreeEntry *TE = BundleMember->TE) {
3982 int Lane = std::distance(
TE->Scalars.begin(),
3983 find(
TE->Scalars, BundleMember->Inst));
3984 assert(Lane >= 0 &&
"Lane not set");
3992 auto *
In = BundleMember->Inst;
3996 In->getNumOperands() ==
TE->getNumOperands()) &&
3997 "Missed TreeEntry operands?");
4000 for (
unsigned OpIdx = 0, NumOperands =
TE->getNumOperands();
4001 OpIdx != NumOperands; ++OpIdx)
4007 for (
Use &U : BundleMember->Inst->operands())
4012 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4013 if (MemoryDepSD->hasValidDependencies() &&
4014 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4017 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4018 assert(!DepBundle->IsScheduled &&
4019 "already scheduled bundle gets ready");
4020 ReadyList.insert(DepBundle);
4022 <<
"SLP: gets ready (mem): " << *DepBundle <<
"\n");
4026 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4027 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4030 ScheduleData *DepBundle = DepSD->FirstInBundle;
4031 assert(!DepBundle->IsScheduled &&
4032 "already scheduled bundle gets ready");
4033 ReadyList.insert(DepBundle);
4035 <<
"SLP: gets ready (ctl): " << *DepBundle <<
"\n");
4048 "Not a valid scheduling region?");
4050 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->
getNextNode()) {
4051 auto *SD = getScheduleData(
I);
4054 assert(isInSchedulingRegion(SD) &&
4055 "primary schedule data not in window?");
4056 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4057 "entire bundle in window!");
4061 for (
auto *SD : ReadyInsts) {
4062 assert(SD->isSchedulingEntity() && SD->isReady() &&
4063 "item in ready list not ready?");
4069 template <
typename ReadyListType>
4070 void initialFillReadyList(ReadyListType &ReadyList) {
4071 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->
getNextNode()) {
4072 ScheduleData *SD = getScheduleData(
I);
4073 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4075 ReadyList.insert(SD);
4077 <<
"SLP: initially in ready list: " << *SD <<
"\n");
4091 std::optional<ScheduleData *>
4093 const InstructionsState &S);
4099 ScheduleData *allocateScheduleDataChunks();
4103 bool extendSchedulingRegion(
Value *V,
const InstructionsState &S);
4108 ScheduleData *PrevLoadStore,
4109 ScheduleData *NextLoadStore);
4113 void calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
4117 void resetSchedule();
4147 ScheduleData *FirstLoadStoreInRegion =
nullptr;
4151 ScheduleData *LastLoadStoreInRegion =
nullptr;
4156 bool RegionHasStackSave =
false;
4159 int ScheduleRegionSize = 0;
4168 int SchedulingRegionID = 1;
4176 void scheduleBlock(BlockScheduling *BS);
4183 struct OrdersTypeDenseMapInfo {
4196 static unsigned getHashValue(
const OrdersType &V) {
4217 unsigned MaxVecRegSize;
4218 unsigned MinVecRegSize;
4233 unsigned ReductionBitWidth = 0;
4236 unsigned BaseGraphSize = 1;
4240 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4261 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4272 return R.VectorizableTree[0].get();
4276 return {
N->UserTreeIndices.begin(),
N->Container};
4280 return {
N->UserTreeIndices.end(),
N->Container};
4307 static unsigned size(
BoUpSLP *R) {
return R->VectorizableTree.size(); }
4318 OS << Entry->Idx <<
".\n";
4321 for (
auto *V : Entry->Scalars) {
4323 if (
llvm::any_of(R->ExternalUses, [&](
const BoUpSLP::ExternalUser &EU) {
4324 return EU.Scalar == V;
4334 if (Entry->isGather())
4336 if (Entry->State == TreeEntry::ScatterVectorize ||
4337 Entry->State == TreeEntry::StridedVectorize)
4338 return "color=blue";
4347 for (
auto *
I : DeletedInstructions) {
4348 if (!
I->getParent()) {
4353 I->insertBefore(
F->getEntryBlock(),
4354 F->getEntryBlock().getFirstNonPHIIt());
4356 I->insertBefore(
F->getEntryBlock().getTerminator());
4359 for (
Use &U :
I->operands()) {
4361 if (
Op && !DeletedInstructions.count(
Op) &&
Op->hasOneUser() &&
4365 I->dropAllReferences();
4367 for (
auto *
I : DeletedInstructions) {
4369 "trying to erase instruction with users.");
4370 I->eraseFromParent();
4376#ifdef EXPENSIVE_CHECKS
4387 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
4388 "Expected non-empty mask.");
4391 for (
unsigned I = 0, E = Prev.
size();
I < E; ++
I)
4393 Reuses[Mask[
I]] = Prev[
I];
4401 bool BottomOrder =
false) {
4402 assert(!Mask.empty() &&
"Expected non-empty mask.");
4403 unsigned Sz = Mask.size();
4406 if (Order.
empty()) {
4408 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
4410 PrevOrder.
swap(Order);
4413 for (
unsigned I = 0;
I < Sz; ++
I)
4415 Order[
I] = PrevOrder[Mask[
I]];
4417 return Data.value() == Sz ||
Data.index() ==
Data.value();
4426 if (Order.
empty()) {
4428 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
4438 for (
unsigned I = 0;
I < Sz; ++
I)
4440 Order[MaskOrder[
I]] =
I;
4444std::optional<BoUpSLP::OrdersType>
4446 assert(TE.isGather() &&
"Expected gather node only.");
4450 Type *ScalarTy = GatheredScalars.
front()->getType();
4451 int NumScalars = GatheredScalars.
size();
4453 return std::nullopt;
4456 if (NumParts == 0 || NumParts >= NumScalars ||
4457 VecTy->getNumElements() % NumParts != 0 ||
4459 VecTy->getNumElements() / NumParts))
4465 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4467 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4470 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4471 return std::nullopt;
4472 OrdersType CurrentOrder(NumScalars, NumScalars);
4473 if (GatherShuffles.
size() == 1 &&
4475 Entries.front().front()->isSame(TE.Scalars)) {
4478 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4479 return CurrentOrder;
4483 return all_of(Mask, [&](
int I) {
4490 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4491 (Entries.size() != 1 ||
4492 Entries.front().front()->ReorderIndices.empty())) ||
4493 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4494 return std::nullopt;
4500 if (ShuffledSubMasks.
test(
I))
4502 const int VF = GetVF(
I);
4508 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
4509 std::fill(Slice.begin(), Slice.end(), NumScalars);
4510 ShuffledSubMasks.
set(
I);
4514 int FirstMin = INT_MAX;
4515 int SecondVecFound =
false;
4517 int Idx = Mask[
I * PartSz + K];
4519 Value *V = GatheredScalars[
I * PartSz + K];
4521 SecondVecFound =
true;
4530 SecondVecFound =
true;
4534 FirstMin = (FirstMin / PartSz) * PartSz;
4536 if (SecondVecFound) {
4537 std::fill(Slice.begin(), Slice.end(), NumScalars);
4538 ShuffledSubMasks.
set(
I);
4542 int Idx = Mask[
I * PartSz + K];
4546 if (Idx >= PartSz) {
4547 SecondVecFound =
true;
4550 if (CurrentOrder[
I * PartSz + Idx] >
4551 static_cast<unsigned>(
I * PartSz + K) &&
4552 CurrentOrder[
I * PartSz + Idx] !=
4553 static_cast<unsigned>(
I * PartSz + Idx))
4554 CurrentOrder[
I * PartSz + Idx] =
I * PartSz + K;
4557 if (SecondVecFound) {
4558 std::fill(Slice.begin(), Slice.end(), NumScalars);
4559 ShuffledSubMasks.
set(
I);
4565 if (!ExtractShuffles.
empty())
4566 TransformMaskToOrder(
4567 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
4568 if (!ExtractShuffles[
I])
4571 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
4573 int K =
I * PartSz + Idx;
4576 if (!TE.ReuseShuffleIndices.empty())
4577 K = TE.ReuseShuffleIndices[K];
4578 if (!TE.ReorderIndices.empty())
4579 K = std::distance(TE.ReorderIndices.begin(),
4580 find(TE.ReorderIndices, K));
4586 .getKnownMinValue());
4591 if (GatherShuffles.
size() == 1 && NumParts != 1) {
4592 if (ShuffledSubMasks.
any())
4593 return std::nullopt;
4594 PartSz = NumScalars;
4597 if (!Entries.empty())
4598 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
4599 if (!GatherShuffles[
I])
4601 return std::max(Entries[
I].front()->getVectorFactor(),
4602 Entries[
I].back()->getVectorFactor());
4605 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
4606 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4607 return std::nullopt;
4608 return std::move(CurrentOrder);
4613 bool CompareOpcodes =
true) {
4622 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4626 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4631template <
typename T>
4636 return CommonAlignment;
4641 unsigned Sz = Order.
size();
4643 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4654static std::optional<Value *>
4660 const SCEV *PtrSCEVLowest =
nullptr;
4661 const SCEV *PtrSCEVHighest =
nullptr;
4667 return std::nullopt;
4669 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4670 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4675 return std::nullopt;
4677 PtrSCEVLowest = PtrSCEV;
4682 return std::nullopt;
4684 PtrSCEVHighest = PtrSCEV;
4691 return std::nullopt;
4692 int Size =
DL.getTypeStoreSize(ElemTy);
4693 auto TryGetStride = [&](
const SCEV *Dist,
4694 const SCEV *Multiplier) ->
const SCEV * {
4696 if (M->getOperand(0) == Multiplier)
4697 return M->getOperand(1);
4698 if (M->getOperand(1) == Multiplier)
4699 return M->getOperand(0);
4702 if (Multiplier == Dist)
4707 const SCEV *Stride =
nullptr;
4708 if (
Size != 1 || SCEVs.
size() > 2) {
4710 Stride = TryGetStride(Dist, Sz);
4712 return std::nullopt;
4715 return std::nullopt;
4718 using DistOrdPair = std::pair<int64_t, int>;
4720 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4722 bool IsConsecutive =
true;
4723 for (
const SCEV *PtrSCEV : SCEVs) {
4725 if (PtrSCEV != PtrSCEVLowest) {
4727 const SCEV *Coeff = TryGetStride(Diff, Stride);
4729 return std::nullopt;
4732 return std::nullopt;
4736 return std::nullopt;
4737 Dist = SC->getAPInt().getZExtValue();
4741 return std::nullopt;
4742 auto Res = Offsets.emplace(Dist, Cnt);
4744 return std::nullopt;
4746 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4749 if (Offsets.size() != SCEVs.
size())
4750 return std::nullopt;
4751 SortedIndices.
clear();
4752 if (!IsConsecutive) {
4756 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4757 SortedIndices[Cnt] = Pair.second;
4767static std::pair<InstructionCost, InstructionCost>
4783 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4786 Mask, NumSrcElts, NumSubElts,
Index)) {
4787 if (
Index + NumSubElts > NumSrcElts &&
4788 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
4801 unsigned *BestVF,
bool TryRecursiveCheck)
const {
4814 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
4820 const unsigned Sz = VL.
size();
4822 auto *POIter = PointerOps.
begin();
4823 for (
Value *V : VL) {
4827 *POIter = L->getPointerOperand();
4856 if (Order.
empty()) {
4857 Ptr0 = PointerOps.
front();
4858 PtrN = PointerOps.
back();
4860 Ptr0 = PointerOps[Order.
front()];
4861 PtrN = PointerOps[Order.
back()];
4863 std::optional<int> Diff =
4866 if (
static_cast<unsigned>(*Diff) == Sz - 1)
4872 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4886 auto IsAnyPointerUsedOutGraph =
4887 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
4889 return !getTreeEntry(U) && !MustGather.contains(U);
4892 const unsigned AbsoluteDiff = std::abs(*Diff);
4893 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4897 AbsoluteDiff > Sz) ||
4898 *Diff == -(
static_cast<int>(Sz) - 1))) {
4899 int Stride = *Diff /
static_cast<int>(Sz - 1);
4900 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
4912 else if (
Ptr != Ptr0)
4916 if (((Dist / Stride) * Stride) != Dist ||
4917 !Dists.
insert(Dist).second)
4920 if (Dists.
size() == Sz)
4929 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment,
4931 bool ProfitableGatherPointers) {
4936 auto [ScalarGEPCost, VectorGEPCost] =
4938 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
4944 VecTy->getNumElements());
4945 if (
static_cast<unsigned>(
count_if(
4952 PtrVecTy, DemandedElts,
true,
false,
CostKind);
4971 false, CommonAlignment,
CostKind) +
4972 (ProfitableGatherPointers ? 0 : VectorGEPCost);
4979 constexpr unsigned ListLimit = 4;
4980 if (!TryRecursiveCheck || VL.
size() < ListLimit)
4989 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
4994 for (
unsigned VF = VL.
size() / 2; VF >= MinVF; VF /= 2) {
4996 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End; Cnt += VF) {
5009 DemandedElts.
setBits(Cnt, Cnt + VF);
5024 if (!DemandedElts.
isZero()) {
5030 if (DemandedElts[Idx])
5041 LI0->getPointerOperand(),
5042 Instruction::GetElementPtr,
CostKind, ScalarTy,
5046 if (
static_cast<unsigned>(
5048 PointerOps.
size() - 1 ||
5067 LI0->getPointerAddressSpace(),
CostKind,
5073 LI0->getPointerOperand(),
5080 LI0->getPointerOperand(),
5091 ShuffleMask[Idx] = Idx / VF ==
I ? VL.
size() + Idx % VF : Idx;
5100 if (MaskedGatherCost >= VecLdCost &&
5113 bool ProfitableGatherPointers =
5114 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
5115 return L->isLoopInvariant(V);
5117 if (ProfitableGatherPointers ||
all_of(PointerOps, [IsSorted](
Value *
P) {
5120 (
GEP &&
GEP->getNumOperands() == 2 &&
5128 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5129 ProfitableGatherPointers))
5140 VL, [](
const Value *V) {
return V->getType()->isPointerTy(); }) &&
5141 "Expected list of pointer operands.");
5146 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
5151 std::optional<int> Diff =
5157 Base.second.emplace_back(
Ptr, *Diff, Cnt++);
5163 if (Bases.size() > VL.
size() / 2 - 1)
5167 Bases[
Ptr].emplace_back(
Ptr, 0, Cnt++);
5173 bool AnyConsecutive =
false;
5174 for (
auto &
Base : Bases) {
5175 auto &Vec =
Base.second;
5176 if (Vec.size() > 1) {
5178 const std::tuple<Value *, int, unsigned> &
Y) {
5179 return std::get<1>(
X) < std::get<1>(
Y);
5181 int InitialOffset = std::get<1>(Vec[0]);
5183 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5189 SortedIndices.
clear();
5190 if (!AnyConsecutive)
5198 for (
auto &
Base : Bases) {
5200 Value *Root = Strip;
5202 Root = Gep->getOperand(0);
5205 auto *Begin = SortedBases.
begin();
5206 auto *
End = SortedBases.
end();
5207 while (Begin !=
End) {
5208 Value *Root = std::get<2>(*Begin);
5209 auto *Mid = std::stable_partition(
5210 Begin,
End, [&Root](
auto V) {
return std::get<2>(V) == Root; });
5212 for (
auto *
I = Begin;
I < Mid; ++
I)
5213 LessThan.try_emplace(std::get<1>(*
I));
5214 for (
auto *
I = Begin;
I < Mid; ++
I) {
5215 Value *V = std::get<1>(*
I);
5217 V = Gep->getOperand(0);
5218 if (LessThan.contains(V))
5219 LessThan[V][std::get<1>(*
I)] =
true;
5222 std::stable_sort(Begin, Mid, [&LessThan](
auto &V1,
auto &V2) {
5223 return LessThan[std::get<1>(V1)][std::get<1>(V2)];
5229 for (
auto Base : SortedBases)
5230 for (
auto &
T : Bases[std::get<0>(
Base)])
5234 "Expected SortedIndices to be the size of VL");
5238std::optional<BoUpSLP::OrdersType>
5240 assert(TE.isGather() &&
"Expected gather node only.");
5241 Type *ScalarTy = TE.Scalars[0]->getType();
5244 Ptrs.
reserve(TE.Scalars.size());
5245 for (
Value *V : TE.Scalars) {
5247 if (!L || !L->isSimple())
5248 return std::nullopt;
5254 return std::move(Order);
5255 return std::nullopt;
5266 if (VU->
getType() != V->getType())
5269 if (!VU->
hasOneUse() && !V->hasOneUse())
5275 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5282 bool IsReusedIdx =
false;
5284 if (IE2 == VU && !IE1)
5286 if (IE1 == V && !IE2)
5287 return V->hasOneUse();
5288 if (IE1 && IE1 != V) {
5290 IsReusedIdx |= ReusedIdx.
test(Idx1);
5291 ReusedIdx.
set(Idx1);
5292 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
5297 if (IE2 && IE2 != VU) {
5299 IsReusedIdx |= ReusedIdx.
test(Idx2);
5300 ReusedIdx.
set(Idx2);
5301 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5306 }
while (!IsReusedIdx && (IE1 || IE2));
5310std::optional<BoUpSLP::OrdersType>
5314 if (!TE.ReuseShuffleIndices.empty()) {
5316 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI) &&
5317 "Reshuffling scalars not yet supported for nodes with padding");
5320 return std::nullopt;
5328 unsigned Sz = TE.Scalars.size();
5329 if (TE.isGather()) {
5330 if (std::optional<OrdersType> CurrentOrder =
5335 ::addMask(Mask, TE.ReuseShuffleIndices);
5336 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5337 unsigned Sz = TE.Scalars.size();
5338 for (
int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5341 Res[Idx + K * Sz] =
I + K * Sz;
5343 return std::move(Res);
5346 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5348 2 * TE.getVectorFactor())) == 1)
5349 return std::nullopt;
5353 if (TE.ReorderIndices.empty())
5354 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5357 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5358 unsigned VF = ReorderMask.
size();
5362 for (
unsigned I = 0;
I < VF;
I += Sz) {
5364 unsigned UndefCnt = 0;
5365 unsigned Limit = std::min(Sz, VF -
I);
5374 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
5376 return std::nullopt;
5378 for (
unsigned K = 0; K < NumParts; ++K) {
5379 unsigned Idx = Val + Sz * K;
5381 ResOrder[Idx] =
I + K;
5384 return std::move(ResOrder);
5386 unsigned VF = TE.getVectorFactor();
5389 TE.ReuseShuffleIndices.end());
5390 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5392 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5393 return Idx && *Idx < Sz;
5396 if (TE.ReorderIndices.empty())
5397 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5400 for (
unsigned I = 0;
I < VF; ++
I) {
5401 int &Idx = ReusedMask[
I];
5404 Value *V = TE.Scalars[ReorderMask[Idx]];
5406 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
5412 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
5413 auto *It = ResOrder.
begin();
5414 for (
unsigned K = 0; K < VF; K += Sz) {
5418 std::iota(SubMask.begin(), SubMask.end(), 0);
5420 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
5421 std::advance(It, Sz);
5424 return Data.index() ==
Data.value();
5426 return std::nullopt;
5427 return std::move(ResOrder);
5429 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5430 any_of(TE.UserTreeIndices,
5432 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5434 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
5435 return std::nullopt;
5436 if ((TE.State == TreeEntry::Vectorize ||
5437 TE.State == TreeEntry::StridedVectorize) &&
5441 return TE.ReorderIndices;
5442 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5443 if (!TE.ReorderIndices.empty())
5444 return TE.ReorderIndices;
5446 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
5447 Value *V1 = TE.Scalars[I1];
5448 Value *V2 = TE.Scalars[I2];
5449 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
5467 if (EE1->getOperand(0) != EE2->getOperand(0))
5475 std::iota(Phis.
begin(), Phis.
end(), 0);
5477 for (
unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5480 for (
unsigned Id = 0, Sz = Phis.
size(); Id < Sz; ++Id)
5481 ResOrder[Id] = PhiToId[Phis[Id]];
5483 return std::nullopt;
5484 return std::move(ResOrder);
5486 if (TE.isGather() && !TE.isAltShuffle() &&
allSameType(TE.Scalars)) {
5489 if ((TE.getOpcode() == Instruction::ExtractElement ||
5493 auto *EE = dyn_cast<ExtractElementInst>(V);
5494 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5499 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5501 if (Reuse || !CurrentOrder.
empty())
5502 return std::move(CurrentOrder);
5510 int Sz = TE.Scalars.size();
5514 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
5515 if (It == TE.Scalars.begin())
5518 if (It != TE.Scalars.end()) {
5520 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5535 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5538 return std::move(Order);
5543 return std::nullopt;
5544 if (TE.Scalars.size() >= 3)
5549 if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
5553 CurrentOrder, PointerOps);
5555 return std::move(CurrentOrder);
5559 if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI))
5561 return CurrentOrder;
5563 return std::nullopt;
5573 for (
unsigned I = Sz, E = Mask.size();
I < E;
I += Sz) {
5575 if (Cluster != FirstCluster)
5581void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
5584 const unsigned Sz =
TE.Scalars.size();
5586 if (!
TE.isGather() ||
5593 addMask(NewMask,
TE.ReuseShuffleIndices);
5595 TE.ReorderIndices.clear();
5602 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5603 *
End =
TE.ReuseShuffleIndices.end();
5604 It !=
End; std::advance(It, Sz))
5605 std::iota(It, std::next(It, Sz), 0);
5611 "Expected same size of orders");
5612 unsigned Sz = Order.
size();
5615 if (Order[Idx] != Sz)
5616 UsedIndices.
set(Order[Idx]);
5618 if (SecondaryOrder.
empty()) {
5620 if (Order[Idx] == Sz && !UsedIndices.
test(Idx))
5624 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5625 !UsedIndices.
test(SecondaryOrder[Idx]))
5626 Order[Idx] = SecondaryOrder[Idx];
5646 ExternalUserReorderMap;
5651 const std::unique_ptr<TreeEntry> &TE) {
5654 findExternalStoreUsersReorderIndices(TE.get());
5655 if (!ExternalUserReorderIndices.
empty()) {
5656 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5658 std::move(ExternalUserReorderIndices));
5664 if (TE->isAltShuffle()) {
5667 unsigned Opcode0 = TE->getOpcode();
5668 unsigned Opcode1 = TE->getAltOpcode();
5671 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5672 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5678 if (std::optional<OrdersType> CurrentOrder =
5688 const TreeEntry *UserTE = TE.get();
5690 if (UserTE->UserTreeIndices.size() != 1)
5693 return EI.UserTE->State == TreeEntry::Vectorize &&
5694 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5697 UserTE = UserTE->UserTreeIndices.back().UserTE;
5700 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5701 if (!(TE->State == TreeEntry::Vectorize ||
5702 TE->State == TreeEntry::StridedVectorize) ||
5703 !TE->ReuseShuffleIndices.empty())
5704 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
5705 if (TE->State == TreeEntry::Vectorize &&
5706 TE->getOpcode() == Instruction::PHI)
5707 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
5712 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
5713 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
5714 auto It = VFToOrderedEntries.
find(VF);
5715 if (It == VFToOrderedEntries.
end())
5730 for (
const TreeEntry *OpTE : OrderedEntries) {
5733 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
5736 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5738 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5739 auto It = GathersToOrders.find(OpTE);
5740 if (It != GathersToOrders.end())
5743 if (OpTE->isAltShuffle()) {
5744 auto It = AltShufflesToOrders.find(OpTE);
5745 if (It != AltShufflesToOrders.end())
5748 if (OpTE->State == TreeEntry::Vectorize &&
5749 OpTE->getOpcode() == Instruction::PHI) {
5750 auto It = PhisToOrders.
find(OpTE);
5751 if (It != PhisToOrders.
end())
5754 return OpTE->ReorderIndices;
5757 auto It = ExternalUserReorderMap.
find(OpTE);
5758 if (It != ExternalUserReorderMap.
end()) {
5759 const auto &ExternalUserReorderIndices = It->second;
5763 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5764 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
5765 ExternalUserReorderIndices.size();
5767 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
5768 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5775 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5776 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5779 unsigned E = Order.size();
5782 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5785 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5787 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5790 if (OrdersUses.empty())
5793 unsigned IdentityCnt = 0;
5794 unsigned FilledIdentityCnt = 0;
5796 for (
auto &Pair : OrdersUses) {
5798 if (!Pair.first.empty())
5799 FilledIdentityCnt += Pair.second;
5800 IdentityCnt += Pair.second;
5805 unsigned Cnt = IdentityCnt;
5806 for (
auto &Pair : OrdersUses) {
5810 if (Cnt < Pair.second ||
5811 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5812 Cnt == Pair.second && !BestOrder.
empty() &&
5815 BestOrder = Pair.first;
5828 unsigned E = BestOrder.
size();
5830 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5833 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5835 if (TE->Scalars.size() != VF) {
5836 if (TE->ReuseShuffleIndices.size() == VF) {
5842 return EI.UserTE->Scalars.size() == VF ||
5843 EI.UserTE->Scalars.size() ==
5846 "All users must be of VF size.");
5849 reorderNodeWithReuses(*TE, Mask);
5853 if ((TE->State == TreeEntry::Vectorize ||
5854 TE->State == TreeEntry::StridedVectorize) &&
5857 !TE->isAltShuffle()) {
5862 TE->reorderOperands(Mask);
5865 TE->reorderOperands(Mask);
5866 assert(TE->ReorderIndices.empty() &&
5867 "Expected empty reorder sequence.");
5870 if (!TE->ReuseShuffleIndices.empty()) {
5877 addMask(NewReuses, TE->ReuseShuffleIndices);
5878 TE->ReuseShuffleIndices.swap(NewReuses);
5884bool BoUpSLP::canReorderOperands(
5885 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5888 for (
unsigned I = 0, E = UserTE->getNumOperands();
I < E; ++
I) {
5889 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
5890 return OpData.first ==
I &&
5891 (OpData.second->State == TreeEntry::Vectorize ||
5892 OpData.second->State == TreeEntry::StridedVectorize);
5895 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
5897 if (
any_of(TE->UserTreeIndices,
5898 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5902 Edges.emplace_back(
I, TE);
5908 if (TE->State != TreeEntry::Vectorize &&
5909 TE->State != TreeEntry::StridedVectorize &&
5910 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5914 TreeEntry *
Gather =
nullptr;
5916 [&
Gather, UserTE,
I](TreeEntry *TE) {
5917 assert(TE->State != TreeEntry::Vectorize &&
5918 TE->State != TreeEntry::StridedVectorize &&
5919 "Only non-vectorized nodes are expected.");
5920 if (
any_of(TE->UserTreeIndices,
5921 [UserTE,
I](
const EdgeInfo &EI) {
5922 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5924 assert(TE->isSame(UserTE->getOperand(
I)) &&
5925 "Operand entry does not match operands.");
5946 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5947 if (TE->State != TreeEntry::Vectorize &&
5948 TE->State != TreeEntry::StridedVectorize)
5950 if (std::optional<OrdersType> CurrentOrder =
5952 OrderedEntries.
insert(TE.get());
5953 if (!(TE->State == TreeEntry::Vectorize ||
5954 TE->State == TreeEntry::StridedVectorize) ||
5955 !TE->ReuseShuffleIndices.empty())
5956 GathersToOrders.
insert(TE.get());
5965 while (!OrderedEntries.
empty()) {
5970 for (TreeEntry *TE : OrderedEntries) {
5971 if (!(TE->State == TreeEntry::Vectorize ||
5972 TE->State == TreeEntry::StridedVectorize ||
5973 (TE->isGather() && GathersToOrders.
contains(TE))) ||
5974 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5977 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5979 !Visited.
insert(TE).second) {
5985 for (
EdgeInfo &EI : TE->UserTreeIndices)
5989 for (TreeEntry *TE : Filtered)
5990 OrderedEntries.remove(TE);
5992 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5994 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
5995 return Data1.first->Idx > Data2.first->Idx;
5997 for (
auto &
Data : UsersVec) {
6000 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
6002 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6003 OrderedEntries.remove(
Op.second);
6016 for (
const auto &
Op :
Data.second) {
6017 TreeEntry *OpTE =
Op.second;
6018 if (!VisitedOps.
insert(OpTE).second)
6020 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6022 const auto Order = [&]() ->
const OrdersType {
6023 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6026 return OpTE->ReorderIndices;
6030 if (Order.size() == 1)
6033 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
6034 return P.second == OpTE;
6037 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
6038 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6041 unsigned E = Order.size();
6044 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6047 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6050 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6052 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
6053 const auto AllowsReordering = [&](
const TreeEntry *TE) {
6054 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6055 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6056 (IgnoreReorder && TE->Idx == 0))
6058 if (TE->isGather()) {
6067 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
6068 TreeEntry *UserTE = EI.
UserTE;
6069 if (!VisitedUsers.
insert(UserTE).second)
6074 if (AllowsReordering(UserTE))
6082 if (
static_cast<unsigned>(
count_if(
6083 Ops, [UserTE, &AllowsReordering](
6084 const std::pair<unsigned, TreeEntry *> &
Op) {
6085 return AllowsReordering(
Op.second) &&
6088 return EI.UserTE == UserTE;
6090 })) <= Ops.
size() / 2)
6091 ++Res.first->second;
6094 if (OrdersUses.empty()) {
6095 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6096 OrderedEntries.remove(
Op.second);
6100 unsigned IdentityCnt = 0;
6101 unsigned VF =
Data.second.front().second->getVectorFactor();
6103 for (
auto &Pair : OrdersUses) {
6105 IdentityCnt += Pair.second;
6110 unsigned Cnt = IdentityCnt;
6111 for (
auto &Pair : OrdersUses) {
6115 if (Cnt < Pair.second) {
6117 BestOrder = Pair.first;
6125 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6126 OrderedEntries.remove(
Op.second);
6135 unsigned E = BestOrder.
size();
6137 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6139 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
6140 TreeEntry *TE =
Op.second;
6141 OrderedEntries.remove(TE);
6142 if (!VisitedOps.
insert(TE).second)
6144 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
6145 reorderNodeWithReuses(*TE, Mask);
6149 if (TE->State != TreeEntry::Vectorize &&
6150 TE->State != TreeEntry::StridedVectorize &&
6151 (TE->State != TreeEntry::ScatterVectorize ||
6152 TE->ReorderIndices.empty()))
6154 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
6155 TE->ReorderIndices.empty()) &&
6156 "Non-matching sizes of user/operand entries.");
6158 if (IgnoreReorder && TE == VectorizableTree.front().get())
6159 IgnoreReorder =
false;
6162 for (TreeEntry *
Gather : GatherOps) {
6164 "Unexpected reordering of gathers.");
6165 if (!
Gather->ReuseShuffleIndices.empty()) {
6171 OrderedEntries.remove(
Gather);
6175 if (
Data.first->State != TreeEntry::Vectorize ||
6177 Data.first->getMainOp()) ||
6178 Data.first->isAltShuffle())
6179 Data.first->reorderOperands(Mask);
6181 Data.first->isAltShuffle() ||
6182 Data.first->State == TreeEntry::StridedVectorize) {
6186 if (
Data.first->ReuseShuffleIndices.empty() &&
6187 !
Data.first->ReorderIndices.empty() &&
6188 !
Data.first->isAltShuffle()) {
6191 OrderedEntries.insert(
Data.first);
6199 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6200 VectorizableTree.front()->ReuseShuffleIndices.empty())
6201 VectorizableTree.front()->ReorderIndices.
clear();
6204Instruction *BoUpSLP::getRootEntryInstruction(
const TreeEntry &Entry)
const {
6205 if ((Entry.getOpcode() == Instruction::Store ||
6206 Entry.getOpcode() == Instruction::Load) &&
6207 Entry.State == TreeEntry::StridedVectorize &&
6208 !Entry.ReorderIndices.empty() &&
isReverseOrder(Entry.ReorderIndices))
6217 for (
auto &TEPtr : VectorizableTree) {
6218 TreeEntry *Entry = TEPtr.get();
6221 if (Entry->isGather())
6225 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6226 Value *Scalar = Entry->Scalars[Lane];
6230 auto It = ScalarToExtUses.
find(Scalar);
6231 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
6235 const auto *ExtI = ExternallyUsedValues.
find(Scalar);
6236 if (ExtI != ExternallyUsedValues.
end()) {
6237 int FoundLane = Entry->findLaneForValue(Scalar);
6238 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
6239 << FoundLane <<
" from " << *Scalar <<
".\n");
6240 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
6241 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
6244 for (
User *U : Scalar->users()) {
6252 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6256 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6260 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6262 Scalar, getRootEntryInstruction(*UseEntry), TLI)) {
6263 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
6265 assert(!UseEntry->isGather() &&
"Bad state");
6269 if (It != ScalarToExtUses.
end()) {
6270 ExternalUses[It->second].User =
nullptr;
6275 if (U && Scalar->hasNUsesOrMore(
UsesLimit))
6277 int FoundLane = Entry->findLaneForValue(Scalar);
6279 <<
" from lane " << FoundLane <<
" from " << *Scalar
6281 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
6282 ExternalUses.emplace_back(Scalar, U, FoundLane);
6291BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
6293 for (
unsigned Lane :
seq<unsigned>(0, TE->Scalars.size())) {
6294 Value *V = TE->Scalars[Lane];
6307 if (SI ==
nullptr || !SI->isSimple() || SI->getFunction() !=
F ||
6311 if (getTreeEntry(U))
6315 auto &StoresVec = PtrToStoresMap[
Ptr];
6318 if (StoresVec.size() > Lane)
6321 if (!StoresVec.empty() &&
6322 SI->getParent() != StoresVec.back()->getParent())
6325 if (!StoresVec.empty() &&
6326 SI->getValueOperand()->getType() !=
6327 StoresVec.back()->getValueOperand()->getType())
6329 StoresVec.push_back(SI);
6332 return PtrToStoresMap;
6336 OrdersType &ReorderIndices)
const {
6344 StoreOffsetVec[0] = {S0, 0};
6349 std::optional<int> Diff =
6351 SI->getPointerOperand(), *
DL, *SE,
6356 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
6361 stable_sort(StoreOffsetVec, [](
const std::pair<StoreInst *, int> &Pair1,
6362 const std::pair<StoreInst *, int> &Pair2) {
6363 int Offset1 = Pair1.second;
6364 int Offset2 = Pair2.second;
6365 return Offset1 < Offset2;
6369 for (
unsigned Idx :
seq<unsigned>(1, StoreOffsetVec.size()))
6370 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
6375 ReorderIndices.reserve(StoresVec.
size());
6377 unsigned Idx =
find_if(StoreOffsetVec,
6378 [SI](
const std::pair<StoreInst *, int> &Pair) {
6379 return Pair.first ==
SI;
6381 StoreOffsetVec.begin();
6382 ReorderIndices.push_back(Idx);
6388 ReorderIndices.clear();
6395 for (
unsigned Idx : Order)
6396 dbgs() << Idx <<
", ";
6402BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
6403 unsigned NumLanes =
TE->Scalars.size();
6406 collectUserStores(TE);
6415 for (
const auto &Pair : PtrToStoresMap) {
6416 auto &StoresVec = Pair.second;
6418 if (StoresVec.size() != NumLanes)
6423 if (!canFormVector(StoresVec, ReorderIndices))
6428 ExternalReorderIndices.
push_back(ReorderIndices);
6430 return ExternalReorderIndices;
6436 UserIgnoreList = &UserIgnoreLst;
6439 buildTree_rec(Roots, 0,
EdgeInfo());
6446 buildTree_rec(Roots, 0,
EdgeInfo());
6455 bool AddNew =
true) {
6461 const int NumScalars = VL.
size();
6463 if (NumScalars > 1) {
6466 if (NumParts == 0 || NumParts >= NumScalars ||
6467 VecTy->getNumElements() % NumParts != 0 ||
6469 VecTy->getNumElements() / NumParts))
6476 VL.
slice(
I * VF, std::min<unsigned>(VF, VL.
size() -
I * VF))) {
6480 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6482 bool IsFound =
false;
6483 for (
auto &
Data : ClusteredLoads) {
6484 if (LI->getParent() !=
Data.front().first->getParent())
6486 std::optional<int> Dist =
6488 Data.front().first->getType(),
6489 Data.front().first->getPointerOperand(),
DL, SE,
6491 if (Dist &&
all_of(
Data, [&](
const std::pair<LoadInst *, int> &Pair) {
6492 IsFound |= Pair.first == LI;
6493 return IsFound || Pair.second != *Dist;
6496 Data.emplace_back(LI, *Dist);
6502 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6505 auto FindMatchingLoads =
6510 int &
Offset,
unsigned &Start) {
6512 return GatheredLoads.
end();
6521 std::optional<int> Dist =
6523 Data.front().first->getType(),
6524 Data.front().first->getPointerOperand(),
DL, SE,
6529 unsigned NumUniques = 0;
6530 for (
auto [Cnt, Pair] :
enumerate(Loads)) {
6532 Data, [&, &
P = Pair](
const std::pair<LoadInst *, int> &PD) {
6533 return PD.first ==
P.first;
6537 [&, &
P = Pair](
const std::pair<LoadInst *, int> &PD) {
6538 return *Dist +
P.second == PD.second;
6543 Repeated.insert(Cnt);
6546 if (NumUniques > 0 &&
6547 (Loads.
size() == NumUniques ||
6548 (Loads.
size() - NumUniques >= 2 &&
6549 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
6555 return std::next(GatheredLoads.
begin(), Idx);
6560 return GatheredLoads.
end();
6562 for (
ArrayRef<std::pair<LoadInst *, int>>
Data : ClusteredLoads) {
6566 auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
6568 while (It != GatheredLoads.
end()) {
6569 assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
6570 for (
unsigned Idx : LocalToAdd)
6572 ToAdd.
insert(LocalToAdd.begin(), LocalToAdd.end());
6573 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
6577 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6584 Loads.push_back(
Data[Idx]);
6590 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6591 return PD.front().first->getParent() == LI->
getParent() &&
6592 PD.front().first->getType() == LI->
getType();
6594 while (It != GatheredLoads.
end()) {
6597 std::next(It), GatheredLoads.
end(),
6598 [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6599 return PD.front().first->getParent() == LI->getParent() &&
6600 PD.front().first->getType() == LI->getType();
6604 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
6605 AddNewLoads(GatheredLoads.emplace_back());
6610void BoUpSLP::tryToVectorizeGatheredLoads(
6612 GatheredLoadsEntriesFirst = VectorizableTree.size();
6615 auto LoadSorter = [](
const std::pair<LoadInst *, int> &L1,
6616 const std::pair<LoadInst *, int> &L2) {
6617 return L1.second > L2.second;
6624 auto *Ty =
getWidenedType(Loads.front()->getType(), Loads.size());
6632 bool Final,
unsigned MaxVF) {
6634 unsigned StartIdx = 0;
6638 for (
int NumElts =
bit_floor(MaxVF); NumElts > 1; NumElts /= 2) {
6644 if (Final && CandidateVFs.
empty())
6647 unsigned BestVF = Final ? CandidateVFs.
back() : 0;
6648 for (
unsigned NumElts : CandidateVFs) {
6649 if (Final && NumElts > BestVF)
6652 for (
unsigned Cnt = StartIdx, E = Loads.
size(); Cnt + NumElts <= E;
6655 if (VectorizedLoads.count(Slice.front()) ||
6656 VectorizedLoads.count(Slice.back()) ||
6662 bool AllowToVectorize =
6664 any_of(ValueToGatherNodes.at(Slice.front()),
6665 [=](
const TreeEntry *TE) {
6666 return TE->Scalars.size() == 2 &&
6667 ((TE->Scalars.front() == Slice.front() &&
6668 TE->Scalars.back() == Slice.back()) ||
6669 (TE->Scalars.front() == Slice.back() &&
6670 TE->Scalars.back() == Slice.front()));
6679 if (LI->hasOneUse())
6685 if (std::distance(LI->user_begin(), LI->user_end()) !=
6688 if (!IsLegalBroadcastLoad)
6692 for (
User *U : LI->users()) {
6695 if (
const TreeEntry *UTE = getTreeEntry(U)) {
6696 for (
int I :
seq<int>(UTE->getNumOperands())) {
6697 if (
all_of(UTE->getOperand(
I),
6698 [LI](
Value *V) { return V == LI; }))
6707 AllowToVectorize = CheckIfAllowed(Slice);
6709 if (AllowToVectorize) {
6714 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
6716 PointerOps, &BestVF);
6718 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
6720 if (MaskedGatherVectorized.
empty() ||
6721 Cnt >= MaskedGatherVectorized.
back() + NumElts)
6726 Results.emplace_back(Values, LS);
6727 VectorizedLoads.insert(Slice.begin(), Slice.end());
6730 if (Cnt == StartIdx)
6731 StartIdx += NumElts;
6734 if (StartIdx >= Loads.
size())
6738 if (!MaskedGatherVectorized.
empty() &&
6739 Cnt < MaskedGatherVectorized.
back() + NumElts)
6745 if (!AllowToVectorize || BestVF == 0)
6749 for (
unsigned Cnt : MaskedGatherVectorized) {
6752 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
6754 VectorizedLoads.insert(Slice.begin(), Slice.end());
6756 if (Cnt == StartIdx)
6757 StartIdx += NumElts;
6761 if (!VectorizedLoads.contains(LI))
6762 NonVectorized.push_back(LI);
6766 auto ProcessGatheredLoads =
6768 bool Final =
false) {
6770 for (
ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
6771 if (LoadsDists.size() <= 1) {
6772 NonVectorized.
push_back(LoadsDists.back().first);
6778 LoadsDists, OriginalLoads.begin(),
6779 [](
const std::pair<LoadInst *, int> &L) { return L.first; });
6782 unsigned MaxConsecutiveDistance = 0;
6783 unsigned CurrentConsecutiveDist = 1;
6784 int LastDist = LocalLoadsDists.
front().second;
6785 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
6786 for (
const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
6787 if (getTreeEntry(
L.first))
6789 assert(LastDist >=
L.second &&
6790 "Expected first distance always not less than second");
6791 if (
static_cast<unsigned>(LastDist -
L.second) ==
6792 CurrentConsecutiveDist) {
6793 ++CurrentConsecutiveDist;
6794 MaxConsecutiveDistance =
6795 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
6799 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
6802 CurrentConsecutiveDist = 1;
6803 LastDist =
L.second;
6806 if (Loads.
size() <= 1)
6808 if (AllowMaskedGather)
6809 MaxConsecutiveDistance = Loads.
size();
6810 else if (MaxConsecutiveDistance < 2)
6815 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
6816 Final, MaxConsecutiveDistance);
6818 OriginalLoads.size() == Loads.
size() &&
6819 MaxConsecutiveDistance == Loads.
size() &&
6824 VectorizedLoads.
clear();
6828 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
6829 UnsortedNonVectorized, Final,
6830 OriginalLoads.size());
6831 if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
6832 SortedNonVectorized.
swap(UnsortedNonVectorized);
6833 Results.swap(UnsortedResults);
6838 << Slice.size() <<
")\n");
6839 if (
any_of(Slice, [&](
Value *V) {
return getTreeEntry(V); })) {
6840 for (
Value *L : Slice)
6841 if (!getTreeEntry(L))
6848 unsigned MaxVF = Slice.size();
6849 unsigned UserMaxVF = 0;
6853 std::optional<unsigned> CommonVF = 0;
6855 for (
auto [Idx, V] :
enumerate(Slice)) {
6856 for (
const TreeEntry *E : ValueToGatherNodes.at(V)) {
6857 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
6859 EntryToPosition.
try_emplace(E, Idx).first->second;
6860 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
6862 if (*CommonVF == 0) {
6863 CommonVF = E->Scalars.size();
6866 if (*CommonVF != E->Scalars.size())
6874 for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
6875 bool IsVectorized =
true;
6876 for (
unsigned I = 0, E = Slice.size();
I < E;
I += VF) {
6878 Slice.
slice(
I, std::min(VF, E -
I));
6879 if (getTreeEntry(SubSlice.
front()))
6881 unsigned Sz = VectorizableTree.size();
6882 buildTree_rec(SubSlice, 0, EdgeInfo());
6883 if (Sz == VectorizableTree.size()) {
6884 IsVectorized =
false;
6892 NonVectorized.
append(SortedNonVectorized);
6894 return NonVectorized;
6897 if (!GatheredLoads.empty() && !NonVectorized.
empty() &&
6899 GatheredLoads.begin(), GatheredLoads.end(), 0u,
6900 [](
unsigned S,
ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
6901 return S + LoadsDists.size();
6902 }) != NonVectorized.
size() &&
6903 IsMaskedGatherSupported(NonVectorized)) {
6905 for (
LoadInst *LI : NonVectorized) {
6913 (void)ProcessGatheredLoads(FinalGatheredLoads,
true);
6917 if (
static_cast<unsigned>(GatheredLoadsEntriesFirst) ==
6918 VectorizableTree.size())
6919 GatheredLoadsEntriesFirst = NoGatheredLoads;
6926 Value *NeedsScheduling =
nullptr;
6927 for (
Value *V : VL) {
6930 if (!NeedsScheduling) {
6931 NeedsScheduling = V;
6936 return NeedsScheduling;
6947 bool AllowAlternate) {
6954 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
6982 std::pair<size_t, size_t> OpVals =
6990 if (CI->isCommutative())
7012 SubKey =
hash_value(Gep->getPointerOperand());
7024 return std::make_pair(Key, SubKey);
7034bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
7036 unsigned Opcode0 = S.getOpcode();
7037 unsigned Opcode1 = S.getAltOpcode();
7041 Opcode0, Opcode1, OpcodeMask))
7058 switch (Res.value_or(0)) {
7073 constexpr unsigned NumAltInsts = 3;
7074 unsigned NonInstCnt = 0;
7077 unsigned UndefCnt = 0;
7079 unsigned ExtraShuffleInsts = 0;
7088 return is_contained(Operands.back(), V);
7091 ++ExtraShuffleInsts;
7109 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
7116 if (!Res.second && Res.first->second == 1)
7117 ++ExtraShuffleInsts;
7118 ++Res.first->getSecond();
7120 UniqueOpcodes.
insert(
I->getOpcode());
7121 else if (Res.second)
7124 return none_of(Uniques, [&](
const auto &
P) {
7125 return P.first->hasNUsesOrMore(
P.second + 1) &&
7127 return getTreeEntry(U) || Uniques.contains(U);
7137 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
7141BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7144 assert(S.MainOp &&
"Expected instructions with same/alternate opcodes only.");
7151 return TreeEntry::NeedToGather;
7153 unsigned ShuffleOrOp =
7156 switch (ShuffleOrOp) {
7157 case Instruction::PHI: {
7160 return TreeEntry::NeedToGather;
7165 if (Term &&
Term->isTerminator()) {
7167 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7168 return TreeEntry::NeedToGather;
7172 return TreeEntry::Vectorize;
7174 case Instruction::ExtractValue:
7175 case Instruction::ExtractElement: {
7176 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7179 return TreeEntry::NeedToGather;
7180 if (Reuse || !CurrentOrder.empty())
7181 return TreeEntry::Vectorize;
7183 return TreeEntry::NeedToGather;
7185 case Instruction::InsertElement: {
7189 for (
Value *V : VL) {
7192 "Non-constant or undef index?");
7196 return !SourceVectors.contains(V);
7199 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7200 "different source vectors.\n");
7201 return TreeEntry::NeedToGather;
7206 return SourceVectors.contains(V) && !V->hasOneUse();
7209 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7210 "multiple uses.\n");
7211 return TreeEntry::NeedToGather;
7214 return TreeEntry::Vectorize;
7216 case Instruction::Load: {
7225 return TreeEntry::Vectorize;
7227 return TreeEntry::ScatterVectorize;
7229 return TreeEntry::StridedVectorize;
7232 Type *ScalarTy = VL0->getType();
7233 if (
DL->getTypeSizeInBits(ScalarTy) !=
7234 DL->getTypeAllocSizeInBits(ScalarTy))
7235 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7243 return TreeEntry::NeedToGather;
7247 case Instruction::ZExt:
7248 case Instruction::SExt:
7249 case Instruction::FPToUI:
7250 case Instruction::FPToSI:
7251 case Instruction::FPExt:
7252 case Instruction::PtrToInt:
7253 case Instruction::IntToPtr:
7254 case Instruction::SIToFP:
7255 case Instruction::UIToFP:
7256 case Instruction::Trunc:
7257 case Instruction::FPTrunc:
7258 case Instruction::BitCast: {
7259 Type *SrcTy = VL0->getOperand(0)->getType();
7260 for (
Value *V : VL) {
7264 dbgs() <<
"SLP: Gathering casts with different src types.\n");
7265 return TreeEntry::NeedToGather;
7268 return TreeEntry::Vectorize;
7270 case Instruction::ICmp:
7271 case Instruction::FCmp: {
7275 Type *ComparedTy = VL0->getOperand(0)->getType();
7276 for (
Value *V : VL) {
7278 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
7279 Cmp->getOperand(0)->getType() != ComparedTy) {
7280 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
7281 return TreeEntry::NeedToGather;
7284 return TreeEntry::Vectorize;
7286 case Instruction::Select:
7287 case Instruction::FNeg:
7288 case Instruction::Add:
7289 case Instruction::FAdd:
7290 case Instruction::Sub:
7291 case Instruction::FSub:
7292 case Instruction::Mul:
7293 case Instruction::FMul:
7294 case Instruction::UDiv:
7295 case Instruction::SDiv:
7296 case Instruction::FDiv:
7297 case Instruction::URem:
7298 case Instruction::SRem:
7299 case Instruction::FRem:
7300 case Instruction::Shl:
7301 case Instruction::LShr:
7302 case Instruction::AShr:
7303 case Instruction::And:
7304 case Instruction::Or:
7305 case Instruction::Xor:
7306 case Instruction::Freeze:
7307 return TreeEntry::Vectorize;
7308 case Instruction::GetElementPtr: {
7310 for (
Value *V : VL) {
7314 if (
I->getNumOperands() != 2) {
7315 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7316 return TreeEntry::NeedToGather;
7323 for (
Value *V : VL) {
7327 Type *CurTy =
GEP->getSourceElementType();
7329 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7330 return TreeEntry::NeedToGather;
7335 Type *Ty1 = VL0->getOperand(1)->getType();
7336 for (
Value *V : VL) {
7340 auto *
Op =
I->getOperand(1);
7342 (
Op->getType() != Ty1 &&
7344 Op->getType()->getScalarSizeInBits() >
7345 DL->getIndexSizeInBits(
7346 V->getType()->getPointerAddressSpace())))) {
7348 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7349 return TreeEntry::NeedToGather;
7353 return TreeEntry::Vectorize;
7355 case Instruction::Store: {
7360 if (
DL->getTypeSizeInBits(ScalarTy) !=
7361 DL->getTypeAllocSizeInBits(ScalarTy)) {
7362 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering stores of non-packed type.\n");
7363 return TreeEntry::NeedToGather;
7367 for (
Value *V : VL) {
7369 if (!
SI->isSimple()) {
7371 return TreeEntry::NeedToGather;
7380 if (CurrentOrder.empty()) {
7381 Ptr0 = PointerOps.
front();
7382 PtrN = PointerOps.
back();
7384 Ptr0 = PointerOps[CurrentOrder.front()];
7385 PtrN = PointerOps[CurrentOrder.back()];
7387 std::optional<int> Dist =
7390 if (
static_cast<unsigned>(*Dist) == VL.size() - 1)
7391 return TreeEntry::Vectorize;
7395 return TreeEntry::NeedToGather;
7397 case Instruction::Call: {
7411 return TreeEntry::NeedToGather;
7416 for (
unsigned J = 0; J != NumArgs; ++J)
7419 for (
Value *V : VL) {
7426 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched calls:" << *CI <<
"!=" << *V
7428 return TreeEntry::NeedToGather;
7432 for (
unsigned J = 0; J != NumArgs; ++J) {
7435 if (ScalarArgs[J] != A1J) {
7437 <<
"SLP: mismatched arguments in call:" << *CI
7438 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
7439 return TreeEntry::NeedToGather;
7448 LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
7449 <<
"!=" << *V <<
'\n');
7450 return TreeEntry::NeedToGather;
7454 return TreeEntry::Vectorize;
7456 case Instruction::ShuffleVector: {
7457 if (!S.isAltShuffle()) {
7460 return TreeEntry::Vectorize;
7463 LLVM_DEBUG(
dbgs() <<
"SLP: ShuffleVector are not vectorized.\n");
7464 return TreeEntry::NeedToGather;
7469 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and "
7470 "the whole alt sequence is not profitable.\n");
7471 return TreeEntry::NeedToGather;
7474 return TreeEntry::Vectorize;
7478 return TreeEntry::NeedToGather;
7492 PHIHandler() =
delete;
7494 : DT(DT), Main(Main), Phis(Phis),
7495 Operands(Main->getNumIncomingValues(),
7497 void buildOperands() {
7498 constexpr unsigned FastLimit = 4;
7509 if (
P->getIncomingBlock(
I) == InBB)
7512 Operands[
I][Idx] =
P->getIncomingValueForBlock(InBB);
7524 Blocks.try_emplace(InBB).first->second.push_back(
I);
7536 auto It =
Blocks.find(InBB);
7539 Operands[It->second.front()][Idx] =
P->getIncomingValue(
I);
7542 for (
const auto &
P :
Blocks) {
7543 if (
P.getSecond().size() <= 1)
7545 unsigned BasicI =
P.getSecond().front();
7546 for (
unsigned I :
ArrayRef(
P.getSecond()).drop_front()) {
7548 [&](
const auto &
Data) {
7549 return !
Data.value() ||
7552 "Expected empty operands list.");
7562 const EdgeInfo &UserTreeIdx) {
7568 auto TryToFindDuplicates = [&](
const InstructionsState &S,
7569 bool DoNotFail =
false) {
7572 for (
Value *V : VL) {
7579 auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
7584 size_t NumUniqueScalarValues = UniqueValues.
size();
7586 *
TTI, UniqueValues.
front()->getType(), NumUniqueScalarValues);
7587 if (NumUniqueScalarValues == VL.size() &&
7589 ReuseShuffleIndices.
clear();
7592 if ((UserTreeIdx.UserTE &&
7593 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI)) ||
7595 LLVM_DEBUG(
dbgs() <<
"SLP: Reshuffling scalars not yet supported "
7596 "for nodes with padding.\n");
7597 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
7601 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
7602 (UniquePositions.size() == 1 &&
all_of(UniqueValues, [](
Value *V) {
7605 if (DoNotFail && UniquePositions.size() > 1 &&
7614 *
TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
7615 if (PWSz == VL.size()) {
7616 ReuseShuffleIndices.
clear();
7618 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
7619 NonUniqueValueVL.
append(PWSz - UniqueValues.
size(),
7620 UniqueValues.
back());
7621 VL = NonUniqueValueVL;
7626 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
7642 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
7647 if (S.getOpcode()) {
7648 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
7649 LLVM_DEBUG(
dbgs() <<
"SLP: \tChecking bundle: " << *S.OpValue <<
".\n");
7650 if (GatheredLoadsEntriesFirst != NoGatheredLoads || !E->isSame(VL)) {
7651 auto It = MultiNodeScalars.
find(S.OpValue);
7652 if (It != MultiNodeScalars.
end()) {
7653 auto *TEIt =
find_if(It->getSecond(),
7654 [&](TreeEntry *ME) { return ME->isSame(VL); });
7655 if (TEIt != It->getSecond().end())
7665 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to partial overlap.\n");
7666 if (TryToFindDuplicates(S))
7667 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7668 ReuseShuffleIndices);
7672 Nodes.
insert(getTreeEntry(S.OpValue));
7673 for (
const TreeEntry *E : MultiNodeScalars.
lookup(S.OpValue))
7676 if (
any_of(Nodes, [&](
const TreeEntry *E) {
7677 return all_of(E->Scalars,
7678 [&](
Value *V) { return Values.contains(V); });
7681 if (TryToFindDuplicates(S))
7682 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7683 ReuseShuffleIndices);
7690 E->UserTreeIndices.push_back(UserTreeIdx);
7691 LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.OpValue
7710 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
7711 if (TryToFindDuplicates(S))
7712 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7713 ReuseShuffleIndices);
7718 if (S.getOpcode() == Instruction::ExtractElement &&
7721 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
7722 if (TryToFindDuplicates(S))
7723 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7724 ReuseShuffleIndices);
7731 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
7740 auto &&NotProfitableForVectorization = [&S,
this,
7742 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
7751 for (
Value *V : VL) {
7754 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
7758 if ((IsCommutative &&
7759 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
7761 all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
7763 assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
7769 I2->getOperand(
Op));
7770 if (
static_cast<unsigned>(
count_if(
7771 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
7777 if (IsCommutative) {
7782 I2->getOperand((
Op + 1) % E));
7784 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
7793 bool IsScatterVectorizeUserTE =
7794 UserTreeIdx.UserTE &&
7795 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
7796 bool AreAllSameBlock = S.getOpcode() &&
allSameBlock(VL);
7797 bool AreScatterAllGEPSameBlock =
7810 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
7812 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
7817 NotProfitableForVectorization(VL)) {
7818 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
7819 if (TryToFindDuplicates(S))
7820 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7821 ReuseShuffleIndices);
7826 if (S.getOpcode() && !EphValues.
empty()) {
7827 for (
Value *V : VL) {
7828 if (EphValues.
count(V)) {
7830 <<
") is ephemeral.\n");
7831 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
7841 for (
Value *V : VL) {
7845 if (getTreeEntry(V)) {
7847 <<
") is already in tree.\n");
7848 if (TryToFindDuplicates(S))
7849 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7850 ReuseShuffleIndices);
7856 if (UserIgnoreList && !UserIgnoreList->empty()) {
7857 for (
Value *V : VL) {
7858 if (UserIgnoreList->contains(V)) {
7859 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to gathered scalar.\n");
7860 if (TryToFindDuplicates(S))
7861 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7862 ReuseShuffleIndices);
7870 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
7873 "Expected pointers only.");
7876 assert(It != VL.end() &&
"Expected at least one GEP.");
7889 newTreeEntry(VL, std::nullopt , S, UserTreeIdx);
7894 if (!TryToFindDuplicates(S,
true))
7900 TreeEntry::EntryState State = getScalarsVectorizationState(
7901 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7902 if (State == TreeEntry::NeedToGather) {
7903 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7904 ReuseShuffleIndices);
7908 auto &BSRef = BlocksSchedules[BB];
7910 BSRef = std::make_unique<BlockScheduling>(BB);
7912 BlockScheduling &BS = *BSRef;
7914 std::optional<ScheduleData *> Bundle =
7915 BS.tryScheduleBundle(UniqueValues,
this, S);
7916#ifdef EXPENSIVE_CHECKS
7921 LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
7922 assert((!BS.getScheduleData(VL0) ||
7923 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7924 "tryScheduleBundle should cancelScheduling on failure");
7925 newTreeEntry(VL, std::nullopt , S, UserTreeIdx,
7926 ReuseShuffleIndices);
7927 NonScheduledFirst.insert(VL.front());
7928 if (S.getOpcode() == Instruction::Load &&
7929 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
7933 LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
7935 unsigned ShuffleOrOp = S.isAltShuffle() ?
7937 auto CreateOperandNodes = [&](TreeEntry *
TE,
const auto &
Operands) {
7945 if (S.getOpcode() != Instruction::PHI || S.isAltShuffle())
7950 for (
unsigned I : PHIOps)
7953 switch (ShuffleOrOp) {
7954 case Instruction::PHI: {
7958 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7962 PHIHandler Handler(*DT, PH, VL);
7963 Handler.buildOperands();
7965 TE->setOperand(
I, Handler.getOperands(
I));
7972 case Instruction::ExtractValue:
7973 case Instruction::ExtractElement: {
7974 if (CurrentOrder.empty()) {
7975 LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
7978 dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence "
7980 for (
unsigned Idx : CurrentOrder)
7981 dbgs() <<
" " << Idx;
7988 newTreeEntry(VL, Bundle , S, UserTreeIdx,
7989 ReuseShuffleIndices, CurrentOrder);
7993 Op0.
assign(VL.size(), VL0->getOperand(0));
7994 VectorizableTree.back()->setOperand(0, Op0);
7997 case Instruction::InsertElement: {
7998 assert(ReuseShuffleIndices.
empty() &&
"All inserts should be unique");
8000 auto OrdCompare = [](
const std::pair<int, int> &P1,
8001 const std::pair<int, int> &P2) {
8002 return P1.first > P2.first;
8005 decltype(OrdCompare)>
8006 Indices(OrdCompare);
8007 for (
int I = 0, E = VL.size();
I < E; ++
I) {
8009 Indices.emplace(Idx,
I);
8011 OrdersType CurrentOrder(VL.size(), VL.size());
8012 bool IsIdentity =
true;
8013 for (
int I = 0, E = VL.size();
I < E; ++
I) {
8014 CurrentOrder[Indices.top().second] =
I;
8015 IsIdentity &= Indices.top().second ==
I;
8019 CurrentOrder.clear();
8020 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8024 TE->setOperandsInOrder();
8025 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
8028 case Instruction::Load: {
8035 TreeEntry *
TE =
nullptr;
8038 case TreeEntry::Vectorize:
8039 TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8040 ReuseShuffleIndices, CurrentOrder);
8041 if (CurrentOrder.empty())
8045 TE->setOperandsInOrder();
8047 case TreeEntry::StridedVectorize:
8049 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8050 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8051 TE->setOperandsInOrder();
8054 case TreeEntry::ScatterVectorize:
8056 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8057 UserTreeIdx, ReuseShuffleIndices);
8058 TE->setOperandsInOrder();
8059 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
8060 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of non-consecutive loads.\n");
8062 case TreeEntry::CombinedVectorize:
8063 case TreeEntry::NeedToGather:
8068 case Instruction::ZExt:
8069 case Instruction::SExt:
8070 case Instruction::FPToUI:
8071 case Instruction::FPToSI:
8072 case Instruction::FPExt:
8073 case Instruction::PtrToInt:
8074 case Instruction::IntToPtr:
8075 case Instruction::SIToFP:
8076 case Instruction::UIToFP:
8077 case Instruction::Trunc:
8078 case Instruction::FPTrunc:
8079 case Instruction::BitCast: {
8080 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8081 std::make_pair(std::numeric_limits<unsigned>::min(),
8082 std::numeric_limits<unsigned>::max()));
8083 if (ShuffleOrOp == Instruction::ZExt ||
8084 ShuffleOrOp == Instruction::SExt) {
8085 CastMaxMinBWSizes = std::make_pair(
8091 }
else if (ShuffleOrOp == Instruction::Trunc) {
8092 CastMaxMinBWSizes = std::make_pair(
8099 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8100 ReuseShuffleIndices);
8103 TE->setOperandsInOrder();
8105 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8106 if (ShuffleOrOp == Instruction::Trunc) {
8107 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->Idx);
8108 }
else if (ShuffleOrOp == Instruction::SIToFP ||
8109 ShuffleOrOp == Instruction::UIToFP) {
8110 unsigned NumSignBits =
8114 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
8116 if (NumSignBits * 2 >=
8118 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->Idx);
8122 case Instruction::ICmp:
8123 case Instruction::FCmp: {
8126 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8127 ReuseShuffleIndices);
8135 "Commutative Predicate mismatch");
8136 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
8139 for (
Value *V : VL) {
8143 if (
Cmp->getPredicate() != P0)
8145 Left.push_back(LHS);
8146 Right.push_back(RHS);
8153 if (ShuffleOrOp == Instruction::ICmp) {
8154 unsigned NumSignBits0 =
8156 if (NumSignBits0 * 2 >=
8158 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->Idx);
8159 unsigned NumSignBits1 =
8161 if (NumSignBits1 * 2 >=
8163 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->Idx);
8167 case Instruction::Select:
8168 case Instruction::FNeg:
8169 case Instruction::Add:
8170 case Instruction::FAdd:
8171 case Instruction::Sub:
8172 case Instruction::FSub:
8173 case Instruction::Mul:
8174 case Instruction::FMul:
8175 case Instruction::UDiv:
8176 case Instruction::SDiv:
8177 case Instruction::FDiv:
8178 case Instruction::URem:
8179 case Instruction::SRem:
8180 case Instruction::FRem:
8181 case Instruction::Shl:
8182 case Instruction::LShr:
8183 case Instruction::AShr:
8184 case Instruction::And:
8185 case Instruction::Or:
8186 case Instruction::Xor:
8187 case Instruction::Freeze: {
8188 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8189 ReuseShuffleIndices);
8196 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
8204 TE->setOperandsInOrder();
8206 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8209 case Instruction::GetElementPtr: {
8210 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8211 ReuseShuffleIndices);
8215 for (
Value *V : VL) {
8221 Operands.front().push_back(
GEP->getPointerOperand());
8230 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8232 [VL0Ty, IndexIdx](
Value *V) {
8236 return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
8240 ->getPointerOperandType()
8243 for (
Value *V : VL) {
8247 ConstantInt::get(Ty, 0,
false));
8250 auto *
Op =
I->getOperand(IndexIdx);
8256 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8264 case Instruction::Store: {
8265 bool Consecutive = CurrentOrder.empty();
8268 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8269 ReuseShuffleIndices, CurrentOrder);
8270 TE->setOperandsInOrder();
8271 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
8275 LLVM_DEBUG(
dbgs() <<
"SLP: added a vector of jumbled stores.\n");
8278 case Instruction::Call: {
8284 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8285 ReuseShuffleIndices);
8290 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
8298 for (
Value *V : VL) {
8313 TE->setOperandsInOrder();
8321 for (
Value *V : VL) {
8329 case Instruction::ShuffleVector: {
8330 TreeEntry *
TE = newTreeEntry(VL, Bundle , S, UserTreeIdx,
8331 ReuseShuffleIndices);
8341 reorderInputsAccordingToOpcode(VL,
Left,
Right, *
this);
8348 "Expected different main/alternate predicates.");
8351 for (
Value *V : VL) {
8363 Left.push_back(LHS);
8364 Right.push_back(RHS);
8374 TE->setOperandsInOrder();
8376 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8394 for (
const auto *Ty : ST->elements())
8395 if (Ty != *ST->element_begin())
8397 N *= ST->getNumElements();
8398 EltTy = *ST->element_begin();
8400 N *= AT->getNumElements();
8401 EltTy = AT->getElementType();
8404 N *= VT->getNumElements();
8405 EltTy = VT->getElementType();
8412 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8420 bool ResizeAllowed)
const {
8422 assert(It != VL.
end() &&
"Expected at least one extract instruction.");
8429 Value *Vec = E0->getOperand(0);
8431 CurrentOrder.
clear();
8435 if (E0->getOpcode() == Instruction::ExtractValue) {
8447 unsigned E = VL.
size();
8448 if (!ResizeAllowed && NElts != E)
8451 unsigned MinIdx = NElts, MaxIdx = 0;
8456 if (Inst->getOperand(0) != Vec)
8464 const unsigned ExtIdx = *Idx;
8465 if (ExtIdx >= NElts)
8467 Indices[
I] = ExtIdx;
8468 if (MinIdx > ExtIdx)
8470 if (MaxIdx < ExtIdx)
8473 if (MaxIdx - MinIdx + 1 > E)
8475 if (MaxIdx + 1 <= E)
8479 bool ShouldKeepOrder =
true;
8485 CurrentOrder.
assign(E, E);
8486 for (
unsigned I = 0;
I <
E; ++
I) {
8489 const unsigned ExtIdx = Indices[
I] - MinIdx;
8490 if (CurrentOrder[ExtIdx] != E) {
8491 CurrentOrder.
clear();
8494 ShouldKeepOrder &= ExtIdx ==
I;
8495 CurrentOrder[ExtIdx] =
I;
8497 if (ShouldKeepOrder)
8498 CurrentOrder.
clear();
8500 return ShouldKeepOrder;
8503bool BoUpSLP::areAllUsersVectorized(
8505 return (
I->hasOneUse() && (!VectorizedVals || VectorizedVals->
contains(
I))) ||
8507 return ScalarToTreeEntry.contains(U) ||
8508 isVectorLikeInstWithConstOps(U) ||
8509 (isa<ExtractElementInst>(U) && MustGather.contains(U));
8513static std::pair<InstructionCost, InstructionCost>
8522 FMF = FPCI->getFastMathFlags();
8526 auto IntrinsicCost =
8533 auto LibCost = IntrinsicCost;
8540 return {IntrinsicCost, LibCost};
8543void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
8547 unsigned Sz = Scalars.size();
8550 if (!ReorderIndices.empty())
8552 for (
unsigned I = 0;
I < Sz; ++
I) {
8554 if (!ReorderIndices.empty())
8557 if (IsAltOp(OpInst)) {
8567 if (!ReuseShuffleIndices.
empty()) {
8569 transform(ReuseShuffleIndices, NewMask.
begin(), [&Mask](
int Idx) {
8570 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
8584 assert(MainP != AltP &&
"Expected different main/alternate predicates.");
8593 assert((MainP ==
P || AltP ==
P || MainP == SwappedP || AltP == SwappedP) &&
8594 "CmpInst expected to match either main or alternate predicate or "
8597 return MainP !=
P && MainP != SwappedP;
8604 const auto *Op0 = Ops.
front();
8610 const bool IsUniform =
all_of(Ops, [=](
Value *V) {
8614 const bool IsPowerOfTwo =
all_of(Ops, [](
Value *V) {
8617 return CI->getValue().isPowerOf2();
8620 const bool IsNegatedPowerOfTwo =
all_of(Ops, [](
Value *V) {
8623 return CI->getValue().isNegatedPowerOf2();
8628 if (IsConstant && IsUniform)
8630 else if (IsConstant)
8644class BaseShuffleAnalysis {
8646 Type *ScalarTy =
nullptr;
8648 BaseShuffleAnalysis(
Type *ScalarTy) : ScalarTy(ScalarTy) {}
8656 unsigned getVF(
Value *V)
const {
8657 assert(V &&
"V cannot be nullptr");
8659 "V does not have FixedVectorType");
8660 assert(ScalarTy &&
"ScalarTy cannot be nullptr");
8662 unsigned VNumElements =
8664 assert(VNumElements > ScalarTyNumElements &&
8665 "the number of elements of V is not large enough");
8666 assert(VNumElements % ScalarTyNumElements == 0 &&
8667 "the number of elements of V is not a vectorized value");
8668 return VNumElements / ScalarTyNumElements;
8676 int Limit =
Mask.size();
8688 if (Limit % VF == 0 &&
all_of(
seq<int>(0, Limit / VF), [=](
int Idx) {
8704 unsigned VF =
Mask.size();
8706 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
8709 int MaskedIdx =
Mask[ExtMask[
I] % VF];
8750 bool SinglePermute) {
8762 if (isIdentityMask(Mask, SVTy,
false)) {
8763 if (!IdentityOp || !SinglePermute ||
8764 (isIdentityMask(Mask, SVTy,
true) &&
8766 IdentityMask.
size()))) {
8771 IdentityMask.
assign(Mask);
8791 if (SV->isZeroEltSplat()) {
8793 IdentityMask.
assign(Mask);
8795 int LocalVF =
Mask.size();
8798 LocalVF = SVOpTy->getNumElements();
8802 static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
8804 ExtMask[Idx] = SV->getMaskValue(
I);
8814 if (!IsOp1Undef && !IsOp2Undef) {
8816 for (
int &
I : Mask) {
8819 if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
8826 combineMasks(LocalVF, ShuffleMask, Mask);
8827 Mask.swap(ShuffleMask);
8829 Op = SV->getOperand(0);
8831 Op = SV->getOperand(1);
8834 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
8839 "Expected masks of same sizes.");
8844 Mask.swap(IdentityMask);
8846 return SinglePermute &&
8849 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
8850 Shuffle->isZeroEltSplat() &&
8863 template <
typename T,
typename ShuffleBuilderTy>
8865 ShuffleBuilderTy &Builder) {
8866 assert(V1 &&
"Expected at least one vector value.");
8868 Builder.resizeToMatch(V1, V2);
8869 int VF =
Mask.size();
8871 VF = FTy->getNumElements();
8881 for (
int I = 0, E =
Mask.size();
I < E; ++
I) {
8883 CombinedMask1[
I] =
Mask[
I];
8885 CombinedMask2[
I] =
Mask[
I] - VF;
8892 (void)peekThroughShuffles(Op1, CombinedMask1,
false);
8893 (void)peekThroughShuffles(Op2, CombinedMask2,
false);
8899 for (
auto [Idx,
I] :
enumerate(CombinedMask1)) {
8902 ExtMask1[Idx] = SV1->getMaskValue(
I);
8907 ExtMask1, UseMask::SecondArg);
8909 for (
auto [Idx,
I] :
enumerate(CombinedMask2)) {
8912 ExtMask2[Idx] = SV2->getMaskValue(
I);
8917 ExtMask2, UseMask::SecondArg);
8918 if (SV1->getOperand(0)->getType() ==
8919 SV2->getOperand(0)->getType() &&
8920 SV1->getOperand(0)->getType() != SV1->getType() &&
8923 Op1 = SV1->getOperand(0);
8924 Op2 = SV2->getOperand(0);
8926 int LocalVF = ShuffleMask1.size();
8928 LocalVF = FTy->getNumElements();
8929 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
8930 CombinedMask1.swap(ShuffleMask1);
8932 LocalVF = ShuffleMask2.size();
8934 LocalVF = FTy->getNumElements();
8935 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
8936 CombinedMask2.swap(ShuffleMask2);
8939 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
8940 Builder.resizeToMatch(Op1, Op2);
8943 .getKnownMinValue(),
8946 .getKnownMinValue());
8947 for (
int I = 0, E =
Mask.size();
I < E; ++
I) {
8950 "Expected undefined mask element");
8951 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
8960 return Builder.createIdentity(Op1);
8961 return Builder.createShuffleVector(
8966 return Builder.createPoison(
8969 bool IsIdentity = peekThroughShuffles(V1, NewMask,
true);
8970 assert(V1 &&
"Expected non-null value after looking through shuffles.");
8973 return Builder.createShuffleVector(V1, NewMask);
8974 return Builder.createIdentity(V1);
8980static std::pair<InstructionCost, InstructionCost>
8991 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9001 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9005 for (
Value *V : Ptrs) {
9015 if (!
Ptr || !
Ptr->hasOneUse())
9019 if (PtrsRetainedInVecCode.
size() == Ptrs.size()) {
9025 TTI::PointersChainInfo::getKnownStride(),
9035 [](
const Value *V) {
9037 return Ptr && !
Ptr->hasAllConstantIndices();
9039 ? TTI::PointersChainInfo::getUnknownStride()
9040 : TTI::PointersChainInfo::getKnownStride();
9047 if (It != Ptrs.
end())
9053 BaseGEP->getPointerOperand(), Indices, VecTy,
9058 return std::make_pair(ScalarCost, VecCost);
9063 BaseGraphSize = VectorizableTree.size();
9072 const InstructionsState &S) {
9076 I2->getOperand(
Op));
9078 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
9080 [](
const std::pair<Value *, Value *> &
P) {
9089 TreeEntry &E = *VectorizableTree[Idx];
9096 if (VL.
size() <= 2 ||
9097 !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9103 unsigned StartIdx = 0;
9105 for (
unsigned VF = VL.
size() / 2; VF >= MinVF; VF =
bit_ceil(VF) / 2) {
9107 for (
unsigned Cnt = StartIdx; Cnt + VF <=
End; Cnt += VF) {
9111 if (
const TreeEntry *SE = getTreeEntry(Slice.front());
9112 SE || getTreeEntry(Slice.back())) {
9115 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9123 bool IsSplat =
isSplat(Slice);
9124 if (Slices.
empty() || !IsSplat ||
9126 Slice.front()->getType(), VF)),
9129 Slice.front()->getType(), 2 * VF)),
9131 count(Slice, Slice.front()) ==
9136 if (!S.getOpcode() || S.isAltShuffle() || !
allSameBlock(Slice))
9141 if ((!UserIgnoreList || E.Idx != 0) &&
9149 if (S.getOpcode() == Instruction::Load) {
9158 }
else if (S.getOpcode() == Instruction::ExtractElement ||
9162 !CheckOperandsProfitability(
9174 auto AddCombinedNode = [&](
unsigned Idx,
unsigned Cnt) {
9175 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9176 if (StartIdx == Cnt)
9177 StartIdx = Cnt + VF;
9178 if (
End == Cnt + VF)
9181 for (
unsigned Cnt : Slices) {
9184 if (
const TreeEntry *SE = getTreeEntry(Slice.front());
9185 SE || getTreeEntry(Slice.back())) {
9188 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9190 AddCombinedNode(SE->Idx, Cnt);
9193 unsigned PrevSize = VectorizableTree.size();
9194 buildTree_rec(Slice, 0,
EdgeInfo(&E, UINT_MAX));
9195 if (PrevSize + 1 == VectorizableTree.size() &&
9196 VectorizableTree[PrevSize]->isGather() &&
9197 VectorizableTree[PrevSize]->getOpcode() !=
9198 Instruction::ExtractElement &&
9200 VectorizableTree.pop_back();
9203 AddCombinedNode(PrevSize, Cnt);
9207 switch (E.getOpcode()) {
9208 case Instruction::Load: {
9211 if (E.State != TreeEntry::Vectorize)
9213 Type *ScalarTy = E.getMainOp()->getType();
9229 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9230 false, CommonAlignment,
CostKind, BaseLI);
9231 if (StridedCost < OriginalVecCost)
9234 E.State = TreeEntry::StridedVectorize;
9238 case Instruction::Store: {
9256 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9257 false, CommonAlignment,
CostKind, BaseSI);
9258 if (StridedCost < OriginalVecCost)
9261 E.State = TreeEntry::StridedVectorize;
9265 case Instruction::Select: {
9266 if (E.State != TreeEntry::Vectorize)
9272 E.CombinedOp = TreeEntry::MinMax;
9273 TreeEntry *CondEntry =
const_cast<TreeEntry *
>(getOperandEntry(&E, 0));
9274 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
9275 CondEntry->State == TreeEntry::Vectorize) {
9277 CondEntry->State = TreeEntry::CombinedVectorize;
9287 if (VectorizableTree.size() <= 1 &&
9288 VectorizableTree.front()->getOpcode() == Instruction::Load)
9291 constexpr unsigned SmallTree = 3;
9292 constexpr unsigned SmallVF = 2;
9293 if ((VectorizableTree.size() <= SmallTree &&
9294 VectorizableTree.front()->Scalars.size() == SmallVF) ||
9295 (VectorizableTree.size() <= 2 && UserIgnoreList))
9302 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9305 (E.getOpcode() == Instruction::Load ||
9306 (!E.getOpcode() &&
any_of(E.Scalars,
9308 return isa<LoadInst>(V) &&
9310 !isDeleted(cast<Instruction>(V));
9317 if (!GatheredLoads.
empty())
9318 tryToVectorizeGatheredLoads(GatheredLoads);
9328 bool IsFinalized =
false;
9341 bool SameNodesEstimated =
true;
9344 if (Ty->getScalarType()->isPointerTy()) {
9346 ConstantInt::getAllOnesValue(
9348 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
9349 Ty->getScalarType());
9367 assert(It != VL.
end() &&
"Expected at least one non-undef value.");
9370 count(VL, *It) > 1 &&
9387 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
9401 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
9409 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
9410 unsigned NumParts) {
9411 assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
9413 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
9414 auto *EE = dyn_cast<ExtractElementInst>(V);
9417 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
9420 return std::max(Sz, VecTy->getNumElements());
9426 -> std::optional<TTI::ShuffleKind> {
9427 if (NumElts <= EltsPerVector)
9428 return std::nullopt;
9430 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
9432 if (I == PoisonMaskElem)
9434 return std::min(S, I);
9437 int OffsetReg1 = OffsetReg0;
9441 int FirstRegId = -1;
9442 Indices.assign(1, OffsetReg0);
9446 int Idx =
I - OffsetReg0;
9448 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
9451 RegIndices.
insert(RegId);
9452 if (RegIndices.
size() > 2)
9453 return std::nullopt;
9454 if (RegIndices.
size() == 2) {
9456 if (Indices.size() == 1) {
9459 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
9461 if (I == PoisonMaskElem)
9463 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
9464 ((I - OffsetReg0) % NumElts) / EltsPerVector;
9465 if (RegId == FirstRegId)
9467 return std::min(S, I);
9470 Indices.push_back(OffsetReg1 % NumElts);
9472 Idx =
I - OffsetReg1;
9474 I = (Idx % NumElts) % EltsPerVector +
9475 (RegId == FirstRegId ? 0 : EltsPerVector);
9485 if (!ShuffleKinds[Part])
9488 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
9492 std::optional<TTI::ShuffleKind> RegShuffleKind =
9493 CheckPerRegistersShuffle(SubMask, Indices);
9494 if (!RegShuffleKind) {
9497 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
9509 for (
unsigned Idx : Indices) {
9510 assert((Idx + EltsPerVector) <=
alignTo(NumElts, EltsPerVector) &&
9511 "SK_ExtractSubvector index out of range");
9523 if (OriginalCost <
Cost)
9524 Cost = OriginalCost;
9532 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
9534 CommonMask[Idx] = Idx;
9539 void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
9541 unsigned SliceSize) {
9542 if (SameNodesEstimated) {
9548 if ((InVectors.
size() == 2 &&
9549 InVectors.
front().get<
const TreeEntry *>() == &E1 &&
9550 InVectors.
back().get<
const TreeEntry *>() == E2) ||
9551 (!E2 && InVectors.
front().get<
const TreeEntry *>() == &E1)) {
9552 unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
9555 "Expected all poisoned elements.");
9557 copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
9562 Cost += createShuffle(InVectors.
front(),
9563 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
9565 transformMaskAfterShuffle(CommonMask, CommonMask);
9567 SameNodesEstimated =
false;
9568 if (!E2 && InVectors.
size() == 1) {
9569 unsigned VF = E1.getVectorFactor();
9574 const auto *E = InVectors.
front().get<
const TreeEntry *>();
9575 VF = std::max(VF, E->getVectorFactor());
9577 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
9579 CommonMask[Idx] = Mask[Idx] + VF;
9580 Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
9581 transformMaskAfterShuffle(CommonMask, CommonMask);
9583 Cost += createShuffle(&E1, E2, Mask);
9584 transformMaskAfterShuffle(CommonMask, Mask);
9588 class ShuffleCostBuilder {
9591 static bool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
9593 return Mask.empty() ||
9594 (VF == Mask.size() &&
9602 ~ShuffleCostBuilder() =
default;
9608 if (isEmptyOrIdentity(Mask, VF))
9617 if (isEmptyOrIdentity(Mask, VF))
9626 void resizeToMatch(
Value *&,
Value *&)
const {}
9636 ShuffleCostBuilder Builder(
TTI);
9639 unsigned CommonVF = Mask.size();
9641 auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &
E,
9645 Type *EScalarTy = E.Scalars.front()->getType();
9646 bool IsSigned =
true;
9647 if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
9649 IsSigned = It->second.second;
9651 if (EScalarTy != ScalarTy) {
9652 unsigned CastOpcode = Instruction::Trunc;
9653 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
9654 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
9656 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
9668 if (EScalarTy != ScalarTy) {
9670 unsigned CastOpcode = Instruction::Trunc;
9671 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
9672 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
9674 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
9681 if (!V1 && !V2 && !P2.
isNull()) {
9683 const TreeEntry *E = P1.
get<
const TreeEntry *>();
9684 unsigned VF = E->getVectorFactor();
9685 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
9686 CommonVF = std::max(VF, E2->getVectorFactor());
9689 return Idx < 2 * static_cast<int>(CommonVF);
9691 "All elements in mask must be less than 2 * CommonVF.");
9692 if (E->Scalars.size() == E2->Scalars.size()) {
9696 for (
int &Idx : CommonMask) {
9699 if (Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
9701 else if (Idx >=
static_cast<int>(CommonVF))
9702 Idx = (E2Mask.
empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
9706 CommonVF = E->Scalars.size();
9707 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
9708 GetNodeMinBWAffectedCost(*E2, CommonVF);
9710 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
9711 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
9715 }
else if (!V1 && P2.
isNull()) {
9717 const TreeEntry *E = P1.
get<
const TreeEntry *>();
9718 unsigned VF = E->getVectorFactor();
9722 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
9723 "All elements in mask must be less than CommonVF.");
9724 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
9726 assert(!EMask.
empty() &&
"Expected non-empty common mask.");
9727 for (
int &Idx : CommonMask) {
9731 CommonVF = E->Scalars.size();
9733 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
9736 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
9737 CommonVF == CommonMask.
size() &&
9739 [](
const auto &&
P) {
9741 static_cast<unsigned>(
P.value()) !=
P.index();
9749 }
else if (V1 && P2.
isNull()) {
9751 ExtraCost += GetValueMinBWAffectedCost(V1);
9752 CommonVF = getVF(V1);
9755 [=](
int Idx) {
return Idx < static_cast<int>(CommonVF); }) &&
9756 "All elements in mask must be less than CommonVF.");
9757 }
else if (V1 && !V2) {
9759 unsigned VF = getVF(V1);
9760 const TreeEntry *E2 = P2.
get<
const TreeEntry *>();
9761 CommonVF = std::max(VF, E2->getVectorFactor());
9764 return Idx < 2 * static_cast<int>(CommonVF);
9766 "All elements in mask must be less than 2 * CommonVF.");
9767 if (E2->Scalars.size() == VF && VF != CommonVF) {
9769 assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
9770 for (
int &Idx : CommonMask) {
9773 if (Idx >=
static_cast<int>(CommonVF))
9774 Idx = E2Mask[Idx - CommonVF] + VF;
9778 ExtraCost += GetValueMinBWAffectedCost(V1);
9780 ExtraCost += GetNodeMinBWAffectedCost(
9781 *E2, std::min(CommonVF, E2->getVectorFactor()));
9783 }
else if (!V1 && V2) {
9785 unsigned VF = getVF(V2);
9786 const TreeEntry *E1 = P1.
get<
const TreeEntry *>();
9787 CommonVF = std::max(VF, E1->getVectorFactor());
9790 return Idx < 2 * static_cast<int>(CommonVF);
9792 "All elements in mask must be less than 2 * CommonVF.");
9793 if (E1->Scalars.size() == VF && VF != CommonVF) {
9795 assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
9796 for (
int &Idx : CommonMask) {
9799 if (Idx >=
static_cast<int>(CommonVF))
9800 Idx = E1Mask[Idx - CommonVF] + VF;
9806 ExtraCost += GetNodeMinBWAffectedCost(
9807 *E1, std::min(CommonVF, E1->getVectorFactor()));
9809 ExtraCost += GetValueMinBWAffectedCost(V2);
9812 assert(V1 && V2 &&
"Expected both vectors.");
9813 unsigned VF = getVF(V1);
9814 CommonVF = std::max(VF, getVF(V2));
9817 return Idx < 2 * static_cast<int>(CommonVF);
9819 "All elements in mask must be less than 2 * CommonVF.");
9821 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
9822 if (V1->
getType() != V2->getType()) {
9839 if (InVectors.
size() == 2)
9841 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
9842 V1, V2, CommonMask, Builder);
9849 : BaseShuffleAnalysis(ScalarTy),
TTI(
TTI),
9850 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
9851 CheckedExtracts(CheckedExtracts) {}
9853 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
9854 unsigned NumParts,
bool &UseVecBaseAsInput) {
9855 UseVecBaseAsInput =
false;
9858 Value *VecBase =
nullptr;
9862 bool PrevNodeFound =
any_of(
9863 ArrayRef(R.VectorizableTree).take_front(E->Idx),
9864 [&](
const std::unique_ptr<TreeEntry> &TE) {
9865 return ((!TE->isAltShuffle() &&
9866 TE->getOpcode() == Instruction::ExtractElement) ||
9868 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
9869 return VL.size() > Data.index() &&
9870 (Mask[Data.index()] == PoisonMaskElem ||
9871 isa<UndefValue>(VL[Data.index()]) ||
9872 Data.value() == VL[Data.index()]);
9879 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
9892 VecBase = EE->getVectorOperand();
9893 UniqueBases.
insert(VecBase);
9894 const TreeEntry *VE = R.getTreeEntry(V);
9895 if (!CheckedExtracts.
insert(V).second ||
9899 return isa<GetElementPtrInst>(U) &&
9900 !R.areAllUsersVectorized(cast<Instruction>(U),
9908 unsigned Idx = *EEIdx;
9910 if (EE->hasOneUse() || !PrevNodeFound) {
9918 EE->getVectorOperandType(), Idx);
9921 Ext->getOpcode(), Ext->getType(), EE->getType(),
9937 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
9940 transformMaskAfterShuffle(CommonMask, CommonMask);
9941 SameNodesEstimated =
false;
9942 if (NumParts != 1 && UniqueBases.
size() != 1) {
9943 UseVecBaseAsInput =
true;
9951 std::optional<InstructionCost>
9955 return std::nullopt;
9961 return Idx < static_cast<int>(E1.getVectorFactor());
9963 "Expected single vector shuffle mask.");
9967 if (InVectors.
empty()) {
9968 CommonMask.
assign(Mask.begin(), Mask.end());
9969 InVectors.
assign({&E1, &E2});
9972 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
9975 if (NumParts == 0 || NumParts >= Mask.size() ||
9976 MaskVecTy->getNumElements() % NumParts != 0 ||
9978 MaskVecTy->getNumElements() / NumParts))
9983 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9984 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
9987 if (InVectors.
empty()) {
9988 CommonMask.
assign(Mask.begin(), Mask.end());
9989 InVectors.
assign(1, &E1);
9992 assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
9995 if (NumParts == 0 || NumParts >= Mask.size() ||
9996 MaskVecTy->getNumElements() % NumParts != 0 ||
9998 MaskVecTy->getNumElements() / NumParts))
10003 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10004 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
10005 if (!SameNodesEstimated && InVectors.
size() == 1)
10019 .get<
const TreeEntry *>()
10020 ->Scalars[
P.index()]);
10021 return EI->getVectorOperand() == V1 ||
10022 EI->getVectorOperand() == V2;
10024 "Expected extractelement vectors.");
10028 if (InVectors.
empty()) {
10030 "Expected empty input mask/vectors.");
10031 CommonMask.
assign(Mask.begin(), Mask.end());
10032 InVectors.
assign(1, V1);
10038 InVectors.
front().is<
const TreeEntry *>() && !CommonMask.
empty() &&
10042 .get<
const TreeEntry *>()
10043 ->Scalars[
P.index()];
10045 return P.value() == Mask[
P.index()] ||
10050 return EI->getVectorOperand() == V1;
10052 "Expected only tree entry for extractelement vectors.");
10056 "Expected only tree entries from extracts/reused buildvectors.");
10057 unsigned VF = getVF(V1);
10058 if (InVectors.
size() == 2) {
10059 Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10060 transformMaskAfterShuffle(CommonMask, CommonMask);
10061 VF = std::max<unsigned>(VF, CommonMask.
size());
10062 }
else if (
const auto *InTE =
10063 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
10064 VF = std::max(VF, InTE->getVectorFactor());
10068 ->getNumElements());
10071 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
10073 CommonMask[Idx] = Mask[Idx] + VF;
10076 Value *Root =
nullptr) {
10077 Cost += getBuildVectorCost(VL, Root);
10081 unsigned VF = VL.
size();
10083 VF = std::min(VF, MaskVF);
10106 std::fill_n(NewVals.
begin() +
I * VecTyNumElements, VecTyNumElements,
10109 Vals.
swap(NewVals);
10122 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10125 IsFinalized =
true;
10128 if (InVectors.
size() == 2)
10129 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10131 Cost += createShuffle(Vec,
nullptr, CommonMask);
10132 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
10134 CommonMask[Idx] = Idx;
10136 "Expected vector length for the final value before action.");
10138 Action(V, CommonMask);
10139 InVectors.
front() = V;
10141 if (!SubVectors.empty()) {
10143 if (InVectors.
size() == 2)
10144 Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10146 Cost += createShuffle(Vec,
nullptr, CommonMask);
10147 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
10149 CommonMask[Idx] = Idx;
10150 for (
auto [E, Idx] : SubVectors) {
10151 Type *EScalarTy = E->Scalars.front()->getType();
10152 bool IsSigned =
true;
10153 if (
auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10156 IsSigned = It->second.second;
10158 if (ScalarTy != EScalarTy) {
10159 unsigned CastOpcode = Instruction::Trunc;
10160 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10161 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10163 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10173 if (!CommonMask.
empty()) {
10174 std::iota(std::next(CommonMask.
begin(), Idx),
10175 std::next(CommonMask.
begin(), Idx + E->getVectorFactor()),
10182 if (CommonMask.
empty()) {
10183 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
10187 createShuffle(InVectors.
front(),
10188 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
10194 "Shuffle construction must be finalized.");
10198const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
10199 unsigned Idx)
const {
10200 if (
const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
10203 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
10204 return TE->isGather() &&
10205 find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
10206 return EI.EdgeIdx == Idx && EI.UserTE == E;
10207 }) != TE->UserTreeIndices.end();
10209 assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
10214 if (TE.State == TreeEntry::ScatterVectorize ||
10215 TE.State == TreeEntry::StridedVectorize)
10217 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
10218 !TE.isAltShuffle()) {
10219 if (TE.ReorderIndices.empty())
10265 auto It = MinBWs.find(E);
10266 Type *OrigScalarTy = ScalarTy;
10267 if (It != MinBWs.end()) {
10274 unsigned EntryVF = E->getVectorFactor();
10277 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
10278 if (E->isGather()) {
10284 ScalarTy = VL.
front()->getType();
10285 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
10286 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
10291 if (!E->ReorderIndices.empty() &&
10292 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
10294 if (E->getOpcode() == Instruction::Store) {
10296 NewMask.
resize(E->ReorderIndices.size());
10297 copy(E->ReorderIndices, NewMask.
begin());
10303 if (NeedToShuffleReuses)
10304 ::addMask(Mask, E->ReuseShuffleIndices);
10308 assert((E->State == TreeEntry::Vectorize ||
10309 E->State == TreeEntry::ScatterVectorize ||
10310 E->State == TreeEntry::StridedVectorize) &&
10311 "Unhandled state");
10312 assert(E->getOpcode() &&
10314 (E->getOpcode() == Instruction::GetElementPtr &&
10315 E->getMainOp()->getType()->isPointerTy())) &&
10318 unsigned ShuffleOrOp =
10320 if (E->CombinedOp != TreeEntry::NotCombinedOp)
10321 ShuffleOrOp = E->CombinedOp;
10323 const unsigned Sz = UniqueValues.
size();
10325 for (
unsigned I = 0;
I < Sz; ++
I) {
10326 if (getTreeEntry(UniqueValues[
I]) == E)
10328 UsedScalars.set(
I);
10330 auto GetCastContextHint = [&](
Value *V) {
10331 if (
const TreeEntry *OpTE = getTreeEntry(V))
10332 return getCastContextHint(*OpTE);
10333 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
10334 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
10347 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
10349 for (
unsigned I = 0;
I < Sz; ++
I) {
10350 if (UsedScalars.test(
I))
10352 ScalarCost += ScalarEltCost(
I);
10361 (E->getOpcode() != Instruction::Load ||
10362 !E->UserTreeIndices.empty())) {
10363 const EdgeInfo &EI =
10364 *
find_if(E->UserTreeIndices, [](
const EdgeInfo &EI) {
10365 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
10367 if (EI.UserTE->getOpcode() != Instruction::Select ||
10369 auto UserBWIt = MinBWs.find(EI.UserTE);
10370 Type *UserScalarTy =
10371 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
10372 if (UserBWIt != MinBWs.end())
10374 UserBWIt->second.first);
10375 if (ScalarTy != UserScalarTy) {
10376 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
10377 unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
10378 unsigned VecOpcode;
10379 auto *UserVecTy =
getWidenedType(UserScalarTy, E->Scalars.size());
10380 if (BWSz > SrcBWSz)
10381 VecOpcode = Instruction::Trunc;
10384 It->second.second ? Instruction::SExt : Instruction::ZExt;
10391 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
10392 ScalarCost,
"Calculated costs for Tree"));
10393 return VecCost - ScalarCost;
10398 assert((E->State == TreeEntry::Vectorize ||
10399 E->State == TreeEntry::StridedVectorize) &&
10400 "Entry state expected to be Vectorize or StridedVectorize here.");
10404 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
10405 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
10406 "Calculated GEPs cost for Tree"));
10408 return VecCost - ScalarCost;
10415 Type *CanonicalType = Ty;
10422 {CanonicalType, CanonicalType});
10427 if (VI && SelectOnly) {
10428 assert(!Ty->isVectorTy() &&
"Expected only for scalar type.");
10431 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
10432 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
10433 {TTI::OK_AnyValue, TTI::OP_None}, CI);
10435 return IntrinsicCost;
10437 switch (ShuffleOrOp) {
10438 case Instruction::PHI: {
10442 for (
Value *V : UniqueValues) {
10448 for (
unsigned I = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
10452 if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
10454 if (!OpTE->ReuseShuffleIndices.empty())
10455 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
10456 OpTE->Scalars.size());
10459 return CommonCost - ScalarCost;
10461 case Instruction::ExtractValue:
10462 case Instruction::ExtractElement: {
10463 auto GetScalarCost = [&](
unsigned Idx) {
10466 if (ShuffleOrOp == Instruction::ExtractElement) {
10468 SrcVecTy = EE->getVectorOperandType();
10471 Type *AggregateTy = EV->getAggregateOperand()->getType();
10474 NumElts = ATy->getNumElements();
10479 if (
I->hasOneUse()) {
10489 Ext->getOpcode(),
Ext->getType(),
I->getType(),
10497 auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
10498 return GetCostDiff(GetScalarCost, GetVectorCost);
10500 case Instruction::InsertElement: {
10501 assert(E->ReuseShuffleIndices.empty() &&
10502 "Unique insertelements only are expected.");
10504 unsigned const NumElts = SrcVecTy->getNumElements();
10505 unsigned const NumScalars = VL.
size();
10511 unsigned OffsetEnd = OffsetBeg;
10512 InsertMask[OffsetBeg] = 0;
10515 if (OffsetBeg > Idx)
10517 else if (OffsetEnd < Idx)
10519 InsertMask[Idx] =
I + 1;
10522 if (NumOfParts > 0 && NumOfParts < NumElts)
10523 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
10524 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
10526 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
10527 unsigned InsertVecSz = std::min<unsigned>(
10529 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
10530 bool IsWholeSubvector =
10531 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
10535 if (OffsetBeg + InsertVecSz > VecSz) {
10538 InsertVecSz = VecSz;
10544 if (!E->ReorderIndices.empty()) {
10549 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
10551 bool IsIdentity =
true;
10553 Mask.swap(PrevMask);
10554 for (
unsigned I = 0;
I < NumScalars; ++
I) {
10556 DemandedElts.
setBit(InsertIdx);
10557 IsIdentity &= InsertIdx - OffsetBeg ==
I;
10558 Mask[InsertIdx - OffsetBeg] =
I;
10560 assert(
Offset < NumElts &&
"Failed to find vector index offset");
10574 InsertVecTy, Mask);
10576 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
10584 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
10585 if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
10586 if (InsertVecSz != VecSz) {
10597 for (
unsigned I = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
10606 case Instruction::ZExt:
10607 case Instruction::SExt:
10608 case Instruction::FPToUI:
10609 case Instruction::FPToSI:
10610 case Instruction::FPExt:
10611 case Instruction::PtrToInt:
10612 case Instruction::IntToPtr:
10613 case Instruction::SIToFP:
10614 case Instruction::UIToFP:
10615 case Instruction::Trunc:
10616 case Instruction::FPTrunc:
10617 case Instruction::BitCast: {
10618 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
10621 unsigned Opcode = ShuffleOrOp;
10622 unsigned VecOpcode = Opcode;
10624 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
10626 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
10627 if (SrcIt != MinBWs.end()) {
10628 SrcBWSz = SrcIt->second.first;
10635 if (BWSz == SrcBWSz) {
10636 VecOpcode = Instruction::BitCast;
10637 }
else if (BWSz < SrcBWSz) {
10638 VecOpcode = Instruction::Trunc;
10639 }
else if (It != MinBWs.end()) {
10640 assert(BWSz > SrcBWSz &&
"Invalid cast!");
10641 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10642 }
else if (SrcIt != MinBWs.end()) {
10643 assert(BWSz > SrcBWSz &&
"Invalid cast!");
10645 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
10647 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
10648 !SrcIt->second.second) {
10649 VecOpcode = Instruction::UIToFP;
10660 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
10662 auto *VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
10664 return CommonCost +
10666 VecOpcode == Opcode ? VI :
nullptr);
10668 return GetCostDiff(GetScalarCost, GetVectorCost);
10670 case Instruction::FCmp:
10671 case Instruction::ICmp:
10672 case Instruction::Select: {
10676 match(VL0, MatchCmp))
10682 auto GetScalarCost = [&](
unsigned Idx) {
10689 !
match(VI, MatchCmp)) ||
10690 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
10696 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
10697 CostKind, getOperandInfo(VI->getOperand(0)),
10698 getOperandInfo(VI->getOperand(1)), VI);
10701 ScalarCost = IntrinsicCost;
10710 CostKind, getOperandInfo(E->getOperand(0)),
10711 getOperandInfo(E->getOperand(1)), VL0);
10715 unsigned CondNumElements = CondType->getNumElements();
10717 assert(VecTyNumElements >= CondNumElements &&
10718 VecTyNumElements % CondNumElements == 0 &&
10719 "Cannot vectorize Instruction::Select");
10720 if (CondNumElements != VecTyNumElements) {
10729 return VecCost + CommonCost;
10731 return GetCostDiff(GetScalarCost, GetVectorCost);
10733 case TreeEntry::MinMax: {
10734 auto GetScalarCost = [&](
unsigned Idx) {
10735 return GetMinMaxCost(OrigScalarTy);
10739 return VecCost + CommonCost;
10741 return GetCostDiff(GetScalarCost, GetVectorCost);
10743 case Instruction::FNeg:
10744 case Instruction::Add:
10745 case Instruction::FAdd:
10746 case Instruction::Sub:
10747 case Instruction::FSub:
10748 case Instruction::Mul:
10749 case Instruction::FMul:
10750 case Instruction::UDiv:
10751 case Instruction::SDiv:
10752 case Instruction::FDiv:
10753 case Instruction::URem:
10754 case Instruction::SRem:
10755 case Instruction::FRem:
10756 case Instruction::Shl:
10757 case Instruction::LShr:
10758 case Instruction::AShr:
10759 case Instruction::And:
10760 case Instruction::Or:
10761 case Instruction::Xor: {
10762 auto GetScalarCost = [&](
unsigned Idx) {
10773 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
10778 return CI && CI->getValue().countr_one() >= It->second.first;
10787 Op2Info, {},
nullptr, TLI) +
10790 return GetCostDiff(GetScalarCost, GetVectorCost);
10792 case Instruction::GetElementPtr: {
10793 return CommonCost + GetGEPCostDiff(VL, VL0);
10795 case Instruction::Load: {
10796 auto GetScalarCost = [&](
unsigned Idx) {
10799 VI->getAlign(), VI->getPointerAddressSpace(),
10805 if (E->State == TreeEntry::Vectorize) {
10807 Instruction::Load, VecTy, LI0->getAlign(),
10809 }
else if (E->State == TreeEntry::StridedVectorize) {
10810 Align CommonAlignment =
10813 Instruction::Load, VecTy, LI0->getPointerOperand(),
10814 false, CommonAlignment,
CostKind);
10816 assert(E->State == TreeEntry::ScatterVectorize &&
"Unknown EntryState");
10817 Align CommonAlignment =
10820 Instruction::Load, VecTy, LI0->getPointerOperand(),
10821 false, CommonAlignment,
CostKind);
10823 return VecLdCost + CommonCost;
10829 if (E->State == TreeEntry::ScatterVectorize)
10836 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
10838 case Instruction::Store: {
10839 bool IsReorder = !E->ReorderIndices.empty();
10840 auto GetScalarCost = [=](
unsigned Idx) {
10844 VI->getAlign(), VI->getPointerAddressSpace(),
10852 if (E->State == TreeEntry::StridedVectorize) {
10853 Align CommonAlignment =
10856 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
10857 false, CommonAlignment,
CostKind);
10859 assert(E->State == TreeEntry::Vectorize &&
10860 "Expected either strided or consecutive stores.");
10863 Instruction::Store, VecTy, BaseSI->getAlign(),
10864 BaseSI->getPointerAddressSpace(),
CostKind, OpInfo);
10866 return VecStCost + CommonCost;
10870 unsigned Idx = IsReorder ? E->ReorderIndices[
I] :
I;
10874 return GetCostDiff(GetScalarCost, GetVectorCost) +
10875 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
10877 case Instruction::Call: {
10878 auto GetScalarCost = [&](
unsigned Idx) {
10894 It != MinBWs.end() ? It->second.first : 0);
10896 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
10898 return GetCostDiff(GetScalarCost, GetVectorCost);
10900 case Instruction::ShuffleVector: {
10901 if (!
SLPReVec || E->isAltShuffle())
10902 assert(E->isAltShuffle() &&
10908 "Invalid Shuffle Vector Operand");
10911 auto TryFindNodeWithEqualOperands = [=]() {
10912 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10915 if (
TE->isAltShuffle() &&
10916 ((
TE->getOpcode() == E->getOpcode() &&
10917 TE->getAltOpcode() == E->getAltOpcode()) ||
10918 (
TE->getOpcode() == E->getAltOpcode() &&
10919 TE->getAltOpcode() == E->getOpcode())) &&
10920 TE->hasEqualOperands(*E))
10925 auto GetScalarCost = [&](
unsigned Idx) {
10927 assert(E->isOpcodeOrAlt(VI) &&
"Unexpected main/alternate opcode");
10937 if (TryFindNodeWithEqualOperands()) {
10939 dbgs() <<
"SLP: diamond match for alternate node found.\n";
10946 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy,
CostKind);
10948 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy,
CostKind);
10951 VecCost = TTIRef.getCmpSelInstrCost(
10952 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(),
CostKind,
10953 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
10955 VecCost += TTIRef.getCmpSelInstrCost(
10956 E->getOpcode(), VecTy, MaskTy,
10958 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
10961 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
10964 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
10965 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
10967 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
10968 if (SrcIt != MinBWs.end()) {
10969 SrcBWSz = SrcIt->second.first;
10973 if (BWSz <= SrcBWSz) {
10974 if (BWSz < SrcBWSz)
10976 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
10980 <<
"SLP: alternate extension, which should be truncated.\n";
10986 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
10989 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
10993 E->buildAltOpShuffleMask(
10995 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
11006 unsigned Opcode0 = E->getOpcode();
11007 unsigned Opcode1 = E->getAltOpcode();
11011 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11013 VecTy, Opcode0, Opcode1, OpcodeMask,
CostKind);
11014 return AltVecCost < VecCost ? AltVecCost : VecCost;
11019 if (
SLPReVec && !E->isAltShuffle())
11020 return GetCostDiff(
11025 "Not supported shufflevector usage.");
11027 unsigned SVNumElements =
11029 ->getNumElements();
11030 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11031 for (
size_t I = 0,
End = VL.
size();
I !=
End;
I += GroupSize) {
11036 "Not supported shufflevector usage.");
11039 [[maybe_unused]]
bool isExtractSubvectorMask =
11040 SV->isExtractSubvectorMask(
Index);
11041 assert(isExtractSubvectorMask &&
11042 "Not supported shufflevector usage.");
11043 if (NextIndex !=
Index)
11045 NextIndex += SV->getShuffleMask().size();
11048 return ::getShuffleCost(
11054 return GetCostDiff(GetScalarCost, GetVectorCost);
11056 case Instruction::Freeze:
11063bool BoUpSLP::isFullyVectorizableTinyTree(
bool ForReduction)
const {
11065 << VectorizableTree.size() <<
" is fully vectorizable .\n");
11067 auto &&AreVectorizableGathers = [
this](
const TreeEntry *
TE,
unsigned Limit) {
11069 return TE->isGather() &&
11071 [
this](
Value *V) { return EphValues.contains(V); }) &&
11073 TE->Scalars.size() < Limit ||
11074 ((
TE->getOpcode() == Instruction::ExtractElement ||
11077 (
TE->getOpcode() == Instruction::Load && !
TE->isAltShuffle()) ||
11082 if (VectorizableTree.size() == 1 &&
11083 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11084 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11086 AreVectorizableGathers(VectorizableTree[0].
get(),
11087 VectorizableTree[0]->Scalars.size()) &&
11088 VectorizableTree[0]->getVectorFactor() > 2)))
11091 if (VectorizableTree.size() != 2)
11099 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11100 AreVectorizableGathers(VectorizableTree[1].
get(),
11101 VectorizableTree[0]->Scalars.size()))
11105 if (VectorizableTree[0]->
isGather() ||
11106 (VectorizableTree[1]->
isGather() &&
11107 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11108 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11116 bool MustMatchOrInst) {
11120 Value *ZextLoad = Root;
11121 const APInt *ShAmtC;
11122 bool FoundOr =
false;
11126 ShAmtC->
urem(8) == 0))) {
11128 ZextLoad = BinOp->getOperand(0);
11129 if (BinOp->getOpcode() == Instruction::Or)
11134 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
11141 Type *SrcTy = Load->getType();
11142 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
11148 LLVM_DEBUG(
dbgs() <<
"SLP: Assume load combining for tree starting at "
11158 unsigned NumElts = VectorizableTree[0]->Scalars.size();
11159 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
11167 unsigned NumElts = Stores.
size();
11168 for (
Value *Scalar : Stores) {
11179 if (VectorizableTree.size() == 2 &&
11181 VectorizableTree[1]->isGather() &&
11182 (VectorizableTree[1]->getVectorFactor() <= 2 ||
11183 !(
isSplat(VectorizableTree[1]->Scalars) ||
11191 constexpr int Limit = 4;
11193 !VectorizableTree.empty() &&
11194 all_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
11195 return (TE->isGather() &&
11196 TE->getOpcode() != Instruction::ExtractElement &&
11198 TE->getOpcode() == Instruction::PHI;
11209 if (isFullyVectorizableTinyTree(ForReduction))
11214 bool IsAllowedSingleBVNode =
11215 VectorizableTree.size() > 1 ||
11216 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
11217 !VectorizableTree.front()->isAltShuffle() &&
11218 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
11219 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
11221 if (
any_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
11222 return TE->isGather() &&
all_of(TE->Scalars, [&](
Value *V) {
11223 return isa<ExtractElementInst, UndefValue>(V) ||
11224 (IsAllowedSingleBVNode &&
11225 !V->hasNUsesOrMore(UsesLimit) &&
11226 any_of(V->users(), IsaPred<InsertElementInst>));
11231 assert(VectorizableTree.empty()
11232 ? ExternalUses.empty()
11233 :
true &&
"We shouldn't have any external users");
11245 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
11258 for (
const auto &TEPtr : VectorizableTree) {
11259 if (TEPtr->State != TreeEntry::Vectorize)
11267 auto *NodeA = DT->
getNode(
A->getParent());
11268 auto *NodeB = DT->
getNode(
B->getParent());
11269 assert(NodeA &&
"Should only process reachable instructions");
11270 assert(NodeB &&
"Should only process reachable instructions");
11271 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11272 "Different nodes should have different DFS numbers");
11273 if (NodeA != NodeB)
11274 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
11275 return B->comesBefore(
A);
11285 LiveValues.
erase(PrevInst);
11286 for (
auto &J : PrevInst->
operands()) {
11292 dbgs() <<
"SLP: #LV: " << LiveValues.
size();
11293 for (
auto *
X : LiveValues)
11294 dbgs() <<
" " <<
X->getName();
11295 dbgs() <<
", Looking at ";
11300 unsigned NumCalls = 0;
11304 while (InstIt != PrevInstIt) {
11305 if (PrevInstIt == PrevInst->
getParent()->rend()) {
11306 PrevInstIt = Inst->getParent()->rbegin();
11312 if (
II->isAssumeLikeIntrinsic())
11316 for (
auto &ArgOp :
II->args())
11319 FMF = FPMO->getFastMathFlags();
11326 if (IntrCost < CallCost)
11333 if (
isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
11334 &*PrevInstIt != PrevInst)
11342 for (
auto *
II : LiveValues) {
11343 auto *ScalarTy =
II->getType();
11345 ScalarTy = VectorTy->getElementType();
11363 const auto *I1 = IE1;
11364 const auto *I2 = IE2;
11376 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
11379 if (I2 && ((I2 == IE2 || I2->
hasOneUse())) &&
11382 }
while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
11389struct ValueSelect {
11390 template <
typename U>
11391 static std::enable_if_t<std::is_same_v<Value *, U>,
Value *>
get(
Value *V) {
11394 template <
typename U>
11395 static std::enable_if_t<!std::is_same_v<Value *, U>, U>
get(
Value *) {
11413template <
typename T>
11419 assert(!ShuffleMask.empty() &&
"Empty list of shuffles for inserts.");
11421 auto VMIt = std::next(ShuffleMask.begin());
11424 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
11426 if (!IsBaseUndef.
all()) {
11428 std::pair<T *, bool> Res =
11429 ResizeAction(ShuffleMask.begin()->first, Mask,
false);
11431 for (
unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
11435 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
11437 auto *V = ValueSelect::get<T *>(
Base);
11439 assert((!V || GetVF(V) == Mask.size()) &&
11440 "Expected base vector of VF number of elements.");
11441 Prev = Action(Mask, {
nullptr, Res.first});
11442 }
else if (ShuffleMask.size() == 1) {
11445 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
11451 Prev = Action(Mask, {ShuffleMask.begin()->first});
11455 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
11456 unsigned Vec2VF = GetVF(VMIt->first);
11457 if (Vec1VF == Vec2VF) {
11461 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
11464 Mask[
I] = SecMask[
I] + Vec1VF;
11467 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
11470 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
11472 std::pair<T *, bool> Res2 =
11473 ResizeAction(VMIt->first, VMIt->second,
false);
11475 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
11482 Mask[
I] = (Res2.second ?
I : SecMask[
I]) + VF;
11485 Prev = Action(Mask, {Res1.first, Res2.first});
11487 VMIt = std::next(VMIt);
11489 bool IsBaseNotUndef = !IsBaseUndef.
all();
11490 (void)IsBaseNotUndef;
11492 for (
auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
11494 std::pair<T *, bool> Res =
11495 ResizeAction(VMIt->first, VMIt->second,
false);
11497 for (
unsigned I = 0, VF = Mask.size();
I < VF; ++
I) {
11500 "Multiple uses of scalars.");
11501 Mask[
I] = (Res.second ?
I : SecMask[
I]) + VF;
11506 Prev = Action(Mask, {Prev, Res.first});
11514template <
typename T>
struct ShuffledInsertData {
11525 << VectorizableTree.size() <<
".\n");
11527 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
11530 for (
unsigned I = 0, E = VectorizableTree.size();
I < E; ++
I) {
11531 TreeEntry &TE = *VectorizableTree[
I];
11534 if (TE.State == TreeEntry::CombinedVectorize) {
11536 dbgs() <<
"SLP: Skipping cost for combined node that starts with "
11537 << *TE.Scalars[0] <<
".\n";
11538 TE.dump();
dbgs() <<
"SLP: Current total cost = " <<
Cost <<
"\n");
11541 if (TE.isGather()) {
11542 if (
const TreeEntry *E = getTreeEntry(TE.getMainOp());
11543 E && E->getVectorFactor() == TE.getVectorFactor() &&
11544 E->isSame(TE.Scalars)) {
11549 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
11556 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
11557 "Expected gather nodes with users only.");
11563 <<
"SLP: Current total cost = " <<
Cost <<
"\n");
11572 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
11574 for (ExternalUser &EU : ExternalUses) {
11579 if (EphValues.
count(EU.User) ||
11587 !ExtractCostCalculated.
insert(EU.Scalar).second)
11599 if (!UsedInserts.
insert(VU).second)
11603 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
11606 [
this, VU](
const ShuffledInsertData<const TreeEntry *> &
Data) {
11611 Value *Op0 =
II->getOperand(0);
11612 if (getTreeEntry(
II) && !getTreeEntry(Op0))
11618 if (It == ShuffledInserts.
end()) {
11620 Data.InsertElements.emplace_back(VU);
11622 VecId = ShuffledInserts.
size() - 1;
11623 auto It = MinBWs.find(ScalarTE);
11624 if (It != MinBWs.end() &&
11626 .
insert(std::make_pair(ScalarTE, FTy->getElementType()))
11628 unsigned BWSz = It->second.first;
11629 unsigned DstBWSz =
DL->getTypeSizeInBits(FTy->getElementType());
11630 unsigned VecOpcode;
11631 if (DstBWSz < BWSz)
11632 VecOpcode = Instruction::Trunc;
11635 It->second.second ? Instruction::SExt : Instruction::ZExt;
11640 FTy->getNumElements()),
11643 <<
" for extending externally used vector with "
11644 "non-equal minimum bitwidth.\n");
11649 It->InsertElements.front() = VU;
11650 VecId = std::distance(ShuffledInserts.
begin(), It);
11652 int InIdx = *InsertIdx;
11654 ShuffledInserts[VecId].ValueMasks[ScalarTE];
11657 Mask[InIdx] = EU.Lane;
11658 DemandedElts[VecId].setBit(InIdx);
11669 auto *VecTy =
getWidenedType(EU.Scalar->getType(), BundleWidth);
11670 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
11671 auto It = MinBWs.find(Entry);
11672 if (It != MinBWs.end()) {
11675 It->second.second ? Instruction::SExt : Instruction::ZExt;
11684 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
11685 Entry->getOpcode() == Instruction::Load) {
11687 auto IsPhiInLoop = [&](
const ExternalUser &U) {
11690 const Loop *L = LI->getLoopFor(Phi->getParent());
11691 return L && (Phi->getParent() ==
I->getParent() ||
11692 L == LI->getLoopFor(
I->getParent()));
11696 if (!ValueToExtUses) {
11697 ValueToExtUses.emplace();
11700 if (IsPhiInLoop(
P.value()))
11703 ValueToExtUses->try_emplace(
P.value().Scalar,
P.index());
11709 bool CanBeUsedAsScalar =
all_of(Inst->operands(), [&](
Value *V) {
11710 if (!getTreeEntry(V)) {
11714 if (auto *EE = dyn_cast<ExtractElementInst>(V))
11715 return !EE->hasOneUse() || !MustGather.contains(EE);
11718 return ValueToExtUses->contains(V);
11720 if (CanBeUsedAsScalar) {
11722 bool KeepScalar = ScalarCost <= ExtraCost;
11726 bool IsProfitablePHIUser =
11728 VectorizableTree.front()->Scalars.size() > 2)) &&
11729 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
11733 auto *PHIUser = dyn_cast<PHINode>(U);
11734 return (!PHIUser ||
11735 PHIUser->getParent() !=
11737 VectorizableTree.front()->getMainOp())
11742 return ValueToExtUses->contains(V);
11744 if (IsProfitablePHIUser) {
11748 (GatheredLoadsEntriesFirst == NoGatheredLoads ||
11749 Entry->Idx < GatheredLoadsEntriesFirst)) {
11750 unsigned ScalarUsesCount =
count_if(Entry->Scalars, [&](
Value *V) {
11751 return ValueToExtUses->contains(V);
11753 auto It = ExtractsCount.
find(Entry);
11754 if (It != ExtractsCount.
end()) {
11755 assert(ScalarUsesCount >= It->getSecond().size() &&
11756 "Expected total number of external uses not less than "
11757 "number of scalar uses.");
11758 ScalarUsesCount -= It->getSecond().size();
11763 KeepScalar = ScalarUsesCount <= 1 || !
has_single_bit(ScalarUsesCount);
11766 ExternalUsesAsOriginalScalar.
insert(EU.Scalar);
11768 auto It = ValueToExtUses->find(V);
11769 if (It != ValueToExtUses->end()) {
11771 ExternalUses[It->second].User = nullptr;
11774 ExtraCost = ScalarCost;
11775 if (!IsPhiInLoop(EU))
11776 ExtractsCount[Entry].insert(Inst);
11781 ExtractCost += ExtraCost;
11784 if (!VectorizedVals.
empty()) {
11785 const TreeEntry &Root = *VectorizableTree.front();
11786 auto BWIt = MinBWs.find(&Root);
11787 if (BWIt != MinBWs.end()) {
11788 Type *DstTy = Root.Scalars.front()->getType();
11789 unsigned OriginalSz =
DL->getTypeSizeInBits(DstTy);
11791 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
11792 if (OriginalSz != SrcSz) {
11793 unsigned Opcode = Instruction::Trunc;
11794 if (OriginalSz > SrcSz)
11795 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
11805 Cost += SpillCost + ExtractCost;
11809 unsigned VF =
Mask.size();
11810 unsigned VecVF =
TE->getVectorFactor();
11812 (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); }) ||
11815 std::copy(
Mask.begin(), std::next(
Mask.begin(), std::min(VF, VecVF)),
11821 dbgs() <<
"SLP: Adding cost " <<
C
11822 <<
" for final shuffle of insertelement external users.\n";
11823 TE->dump();
dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
11825 return std::make_pair(TE,
true);
11827 return std::make_pair(TE,
false);
11830 for (
int I = 0, E = ShuffledInserts.size();
I < E; ++
I) {
11831 Value *
Base = ShuffledInserts[
I].InsertElements.front()->getOperand(0);
11832 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
11836 assert((TEs.size() == 1 || TEs.size() == 2) &&
11837 "Expected exactly 1 or 2 tree entries.");
11838 if (TEs.size() == 1) {
11840 VF = TEs.front()->getVectorFactor();
11841 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
11845 (
Data.index() < VF &&
11846 static_cast<int>(
Data.index()) ==
Data.value());
11851 <<
" for final shuffle of insertelement "
11852 "external users.\n";
11853 TEs.front()->
dump();
11854 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
11860 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
11861 VF = TEs.front()->getVectorFactor();
11865 auto *FTy =
getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
11869 <<
" for final shuffle of vector node and external "
11870 "insertelement users.\n";
11871 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
11872 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
11880 [](
const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
11881 EstimateShufflesCost);
11884 ShuffledInserts[
I].InsertElements.front()->getType()),
11887 Cost -= InsertCost;
11891 if (ReductionBitWidth != 0) {
11892 assert(UserIgnoreList &&
"Expected reduction tree.");
11893 const TreeEntry &E = *VectorizableTree.front();
11894 auto It = MinBWs.find(&E);
11895 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
11896 unsigned SrcSize = It->second.first;
11897 unsigned DstSize = ReductionBitWidth;
11898 unsigned Opcode = Instruction::Trunc;
11899 if (SrcSize < DstSize)
11900 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11902 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
11904 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
11907 switch (E.getOpcode()) {
11908 case Instruction::SExt:
11909 case Instruction::ZExt:
11910 case Instruction::Trunc: {
11911 const TreeEntry *OpTE = getOperandEntry(&E, 0);
11912 CCH = getCastContextHint(*OpTE);
11922 <<
" for final resize for reduction from " << SrcVecTy
11923 <<
" to " << DstVecTy <<
"\n";
11924 dbgs() <<
"SLP: Current total cost = " << Cost <<
"\n");
11932 OS <<
"SLP: Spill Cost = " << SpillCost <<
".\n"
11933 <<
"SLP: Extract Cost = " << ExtractCost <<
".\n"
11934 <<
"SLP: Total Cost = " << Cost <<
".\n";
11938 ViewGraph(
this,
"SLP" +
F->getName(),
false, Str);
11949std::optional<TTI::ShuffleKind>
11950BoUpSLP::tryToGatherSingleRegisterExtractElements(
11956 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
11977 ExtractMask.reset(*Idx);
11982 VectorOpToIdx[EI->getVectorOperand()].push_back(
I);
11987 stable_sort(Vectors, [](
const auto &P1,
const auto &P2) {
11988 return P1.second.size() > P2.second.size();
11991 const int UndefSz = UndefVectorExtracts.
size();
11992 unsigned SingleMax = 0;
11993 unsigned PairMax = 0;
11994 if (!Vectors.
empty()) {
11995 SingleMax = Vectors.
front().second.size() + UndefSz;
11996 if (Vectors.
size() > 1) {
11997 auto *ItNext = std::next(Vectors.
begin());
11998 PairMax = SingleMax + ItNext->second.size();
12001 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12002 return std::nullopt;
12008 if (SingleMax >= PairMax && SingleMax) {
12009 for (
int Idx : Vectors.
front().second)
12010 std::swap(GatheredExtracts[Idx], VL[Idx]);
12011 }
else if (!Vectors.
empty()) {
12012 for (
unsigned Idx : {0, 1})
12013 for (
int Idx : Vectors[Idx].second)
12014 std::swap(GatheredExtracts[Idx], VL[Idx]);
12017 for (
int Idx : UndefVectorExtracts)
12018 std::swap(GatheredExtracts[Idx], VL[Idx]);
12021 std::optional<TTI::ShuffleKind> Res =
12027 return std::nullopt;
12031 for (
int I = 0, E = GatheredExtracts.size();
I < E; ++
I) {
12054 unsigned NumParts)
const {
12055 assert(NumParts > 0 &&
"NumParts expected be greater than or equal to 1.");
12065 std::optional<TTI::ShuffleKind> Res =
12066 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
12067 ShufflesRes[Part] = Res;
12068 copy(SubMask, std::next(
Mask.begin(), Part * SliceSize));
12070 if (
none_of(ShufflesRes, [](
const std::optional<TTI::ShuffleKind> &Res) {
12071 return Res.has_value();
12073 ShufflesRes.clear();
12074 return ShufflesRes;
12077std::optional<TargetTransformInfo::ShuffleKind>
12078BoUpSLP::isGatherShuffledSingleRegisterEntry(
12084 const EdgeInfo &TEUseEI =
TE == VectorizableTree.front().get()
12085 ? EdgeInfo(
const_cast<TreeEntry *
>(TE), 0)
12086 :
TE->UserTreeIndices.
front();
12087 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
12092 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
12095 TEInsertBlock = TEInsertPt->
getParent();
12098 return std::nullopt;
12099 auto *NodeUI = DT->
getNode(TEInsertBlock);
12100 assert(NodeUI &&
"Should only process reachable instructions");
12102 auto CheckOrdering = [&](
const Instruction *InsertPt) {
12116 auto *NodeEUI = DT->
getNode(InsertBlock);
12119 assert((NodeUI == NodeEUI) ==
12120 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
12121 "Different nodes should have different DFS numbers");
12123 if (TEInsertPt->
getParent() != InsertBlock &&
12126 if (TEInsertPt->
getParent() == InsertBlock &&
12140 for (
Value *V : VL) {
12145 for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
12149 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
12150 "Must contain at least single gathered value.");
12151 assert(TEPtr->UserTreeIndices.size() == 1 &&
12152 "Expected only single user of a gather node.");
12153 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
12158 : &getLastInstructionInBundle(UseEI.UserTE);
12159 if (TEInsertPt == InsertPt) {
12163 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
12167 if (TEUseEI.UserTE != UseEI.UserTE &&
12168 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
12174 if ((TEInsertBlock != InsertPt->
getParent() ||
12175 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
12176 !CheckOrdering(InsertPt))
12180 if (
const TreeEntry *VTE = getTreeEntry(V)) {
12181 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst) {
12182 if (VTE->State != TreeEntry::Vectorize) {
12183 auto It = MultiNodeScalars.
find(V);
12184 if (It == MultiNodeScalars.
end())
12186 VTE = *It->getSecond().begin();
12188 auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
12189 return MTE->State == TreeEntry::Vectorize;
12191 if (MIt == It->getSecond().end())
12196 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
12197 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
12201 if (VToTEs.
empty())
12203 if (UsedTEs.
empty()) {
12217 if (!VToTEs.
empty()) {
12223 VToTEs = SavedVToTEs;
12228 if (Idx == UsedTEs.
size()) {
12232 if (UsedTEs.
size() == 2)
12234 UsedTEs.push_back(SavedVToTEs);
12235 Idx = UsedTEs.
size() - 1;
12241 if (UsedTEs.
empty()) {
12243 return std::nullopt;
12247 if (UsedTEs.
size() == 1) {
12250 UsedTEs.front().
end());
12251 sort(FirstEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
12252 return TE1->Idx < TE2->Idx;
12255 auto *It =
find_if(FirstEntries, [=](
const TreeEntry *EntryPtr) {
12256 return EntryPtr->isSame(VL) || EntryPtr->isSame(
TE->Scalars);
12258 if (It != FirstEntries.end() &&
12259 ((*It)->getVectorFactor() == VL.size() ||
12260 ((*It)->getVectorFactor() ==
TE->Scalars.size() &&
12261 TE->ReuseShuffleIndices.size() == VL.size() &&
12262 (*It)->isSame(
TE->Scalars)))) {
12263 Entries.push_back(*It);
12264 if ((*It)->getVectorFactor() == VL.size()) {
12265 std::iota(std::next(
Mask.begin(), Part * VL.size()),
12266 std::next(
Mask.begin(), (Part + 1) * VL.size()), 0);
12272 for (
int I = 0, Sz = VL.size();
I < Sz; ++
I)
12279 Entries.push_back(FirstEntries.front());
12282 assert(UsedTEs.
size() == 2 &&
"Expected at max 2 permuted entries.");
12285 for (
const TreeEntry *TE : UsedTEs.front()) {
12286 unsigned VF =
TE->getVectorFactor();
12287 auto It = VFToTE.
find(VF);
12288 if (It != VFToTE.
end()) {
12289 if (It->second->Idx >
TE->Idx)
12290 It->getSecond() =
TE;
12297 UsedTEs.back().
end());
12298 sort(SecondEntries, [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
12299 return TE1->Idx < TE2->Idx;
12301 for (
const TreeEntry *TE : SecondEntries) {
12302 auto It = VFToTE.
find(
TE->getVectorFactor());
12303 if (It != VFToTE.
end()) {
12305 Entries.push_back(It->second);
12306 Entries.push_back(TE);
12312 if (Entries.empty()) {
12314 UsedTEs.front(), [](
const TreeEntry *TE1,
const TreeEntry *TE2) {
12315 return TE1->Idx < TE2->Idx;
12317 Entries.push_back(SecondEntries.front());
12318 VF = std::max(Entries.front()->getVectorFactor(),
12319 Entries.back()->getVectorFactor());
12326 auto AreCompatiblePHIs = [&](
Value *V,
Value *V1) {
12333 for (
int I = 0, E =
PHI->getNumIncomingValues();
I < E; ++
I) {
12335 Value *In1 = PHI1->getIncomingValue(
I);
12350 auto MightBeIgnored = [=](
Value *V) {
12352 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.
count(
I) &&
12354 !areAllUsersVectorized(
I, UserIgnoreList) &&
isSimple(
I);
12359 auto NeighborMightBeIgnored = [&](
Value *V,
int Idx) {
12360 Value *V1 = VL[Idx];
12361 bool UsedInSameVTE =
false;
12362 auto It = UsedValuesEntry.
find(V1);
12363 if (It != UsedValuesEntry.
end())
12364 UsedInSameVTE = It->second == UsedValuesEntry.
find(V)->second;
12365 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
12374 for (
int I = 0, E = VL.size();
I < E; ++
I) {
12376 auto It = UsedValuesEntry.
find(V);
12377 if (It == UsedValuesEntry.
end())
12383 ((
I > 0 && NeighborMightBeIgnored(V,
I - 1)) ||
12384 (
I != E - 1 && NeighborMightBeIgnored(V,
I + 1)))))
12386 unsigned Idx = It->second;
12393 for (
unsigned I = 0, Sz = Entries.size();
I < Sz; ++
I) {
12394 if (!UsedIdxs.test(
I))
12400 for (std::pair<unsigned, int> &Pair : EntryLanes)
12401 if (Pair.first ==
I)
12402 Pair.first = TempEntries.
size();
12405 Entries.swap(TempEntries);
12406 if (EntryLanes.size() == Entries.size() &&
12408 .
slice(Part * VL.size(),
12409 std::min<int>(VL.size(),
TE->Scalars.size())))) {
12415 return std::nullopt;
12418 bool IsIdentity = Entries.size() == 1;
12421 for (
const std::pair<unsigned, int> &Pair : EntryLanes) {
12422 unsigned Idx = Part * VL.size() + Pair.second;
12425 (ForOrder ? std::distance(
12426 Entries[Pair.first]->Scalars.begin(),
12427 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
12428 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
12429 IsIdentity &=
Mask[Idx] == Pair.second;
12431 switch (Entries.size()) {
12433 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
12437 if (EntryLanes.size() > 2 || VL.size() <= 2)
12445 std::fill(std::next(
Mask.begin(), Part * VL.size()),
12447 return std::nullopt;
12451BoUpSLP::isGatherShuffledEntry(
12455 assert(NumParts > 0 && NumParts < VL.
size() &&
12456 "Expected positive number of registers.");
12459 if (TE == VectorizableTree.front().get() &&
12460 (GatheredLoadsEntriesFirst == NoGatheredLoads ||
12462 [](
const std::unique_ptr<TreeEntry> &TE) {
12463 return !
TE->isGather();
12467 if (
TE->isNonPowOf2Vec())
12470 assert((
TE->UserTreeIndices.size() == 1 ||
12471 TE == VectorizableTree.front().get()) &&
12472 "Expected only single user of the gather node.");
12474 "Number of scalars must be divisible by NumParts.");
12475 if (!
TE->UserTreeIndices.empty() &&
12476 TE->UserTreeIndices.front().UserTE->isGather() &&
12477 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
12478 assert((
TE->Idx == 0 ||
TE->getOpcode() == Instruction::ExtractElement ||
12480 "Expected splat or extractelements only node.");
12489 std::optional<TTI::ShuffleKind> SubRes =
12490 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
12493 SubEntries.
clear();
12496 SubEntries.
front()->getVectorFactor() == VL.
size() &&
12497 (SubEntries.
front()->isSame(
TE->Scalars) ||
12498 SubEntries.
front()->isSame(VL))) {
12500 LocalSubEntries.
swap(SubEntries);
12503 std::iota(
Mask.begin(),
Mask.end(), 0);
12505 for (
int I = 0, Sz = VL.
size();
I < Sz; ++
I)
12508 Entries.emplace_back(1, LocalSubEntries.
front());
12514 [](
const std::optional<TTI::ShuffleKind> &SK) {
return !SK; })) {
12522 Type *ScalarTy)
const {
12524 bool DuplicateNonConst =
false;
12532 auto EstimateInsertCost = [&](
unsigned I,
Value *V) {
12533 if (V->getType() != ScalarTy) {
12544 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
12555 EstimateInsertCost(
I, V);
12556 ShuffleMask[
I] =
I;
12560 DuplicateNonConst =
true;
12562 ShuffleMask[
I] = Res.first->second;
12564 if (ForPoisonSrc) {
12572 if (!ShuffledElements[
I])
12582 if (DuplicateNonConst)
12584 VecTy, ShuffleMask);
12596 VLOperands Ops(VL, R);
12599 Left = Ops.getVL(0);
12600 Right = Ops.getVL(1);
12603Instruction &BoUpSLP::getLastInstructionInBundle(
const TreeEntry *E) {
12604 auto &Res = EntryToLastInstruction.
try_emplace(E).first->second;
12610 auto *Front = E->getMainOp();
12612 assert(((GatheredLoadsEntriesFirst != NoGatheredLoads &&
12613 E->getOpcode() == Instruction::Load && E->isGather() &&
12614 E->Idx < GatheredLoadsEntriesFirst) ||
12616 [=](
Value *V) ->
bool {
12617 if (E->getOpcode() == Instruction::GetElementPtr &&
12618 !isa<GetElementPtrInst>(V))
12620 auto *I = cast<Instruction>(V);
12621 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
12622 isVectorLikeInstWithConstOps(I);
12624 "Expected gathered loads or GEPs or instructions from same basic "
12627 auto FindLastInst = [&]() {
12629 for (
Value *V : E->Scalars) {
12633 if (LastInst->
getParent() ==
I->getParent()) {
12638 assert(((E->getOpcode() == Instruction::GetElementPtr &&
12642 (GatheredLoadsEntriesFirst != NoGatheredLoads &&
12643 E->getOpcode() == Instruction::Load && E->isGather() &&
12644 E->Idx < GatheredLoadsEntriesFirst)) &&
12645 "Expected vector-like or non-GEP in GEP node insts only.");
12653 auto *NodeB = DT->
getNode(
I->getParent());
12654 assert(NodeA &&
"Should only process reachable instructions");
12655 assert(NodeB &&
"Should only process reachable instructions");
12656 assert((NodeA == NodeB) ==
12657 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12658 "Different nodes should have different DFS numbers");
12659 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
12666 auto FindFirstInst = [&]() {
12668 for (
Value *V : E->Scalars) {
12672 if (FirstInst->
getParent() ==
I->getParent()) {
12673 if (
I->comesBefore(FirstInst))
12677 assert(((E->getOpcode() == Instruction::GetElementPtr &&
12681 "Expected vector-like or non-GEP in GEP node insts only.");
12689 auto *NodeB = DT->
getNode(
I->getParent());
12690 assert(NodeA &&
"Should only process reachable instructions");
12691 assert(NodeB &&
"Should only process reachable instructions");
12692 assert((NodeA == NodeB) ==
12693 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12694 "Different nodes should have different DFS numbers");
12695 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
12702 if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
12703 E->Idx >= GatheredLoadsEntriesFirst && !E->isGather() &&
12704 E->getOpcode() == Instruction::Load) {
12705 Res = FindFirstInst();
12713 if ((E->getOpcode() == Instruction::GetElementPtr &&
12716 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
12720 return !isVectorLikeInstWithConstOps(V) &&
12721 isUsedOutsideBlock(V);
12723 (E->isGather() && E->Idx == 0 &&
all_of(E->Scalars, [](
Value *V) {
12724 return isa<ExtractElementInst, UndefValue>(V) ||
12725 areAllOperandsNonInsts(V);
12727 Res = FindLastInst();
12729 Res = FindFirstInst();
12737 if (BlocksSchedules.count(BB)) {
12738 Value *V = E->isOneOf(E->Scalars.back());
12741 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
12742 if (Bundle && Bundle->isPartOfBundle())
12743 for (; Bundle; Bundle = Bundle->NextInBundle)
12744 Res = Bundle->Inst;
12766 Res = FindLastInst();
12767 assert(Res &&
"Failed to find last instruction in bundle");
12771void BoUpSLP::setInsertPointAfterBundle(
const TreeEntry *E) {
12772 auto *Front = E->getMainOp();
12773 Instruction *LastInst = &getLastInstructionInBundle(E);
12774 assert(LastInst &&
"Failed to find last instruction in bundle");
12779 LastInstIt = LastInst->
getParent()->getFirstNonPHIIt();
12781 Builder.SetInsertPoint(LastInst->
getParent(), LastInstIt);
12785 Builder.SetInsertPoint(
12789 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
12799 Loop *
L = LI->getLoopFor(Builder.GetInsertBlock());
12802 while (InsertBB && InsertBB != InstBB && Visited.
insert(InsertBB).second)
12803 InsertBB = InsertBB->getSinglePredecessor();
12804 return InsertBB && InsertBB == InstBB;
12806 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
12808 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
12809 getTreeEntry(Inst) ||
12810 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
12811 PostponedIndices.
insert(
I).second)
12815 auto &&CreateInsertElement = [
this](
Value *Vec,
Value *V,
unsigned Pos,
12818 if (
Scalar->getType() != Ty) {
12820 Ty->isIntOrIntVectorTy() &&
"Expected integer types only.");
12826 !IOp || !(
isDeleted(IOp) || getTreeEntry(IOp)))
12829 Scalar = Builder.CreateIntCast(
12836 Vec = InsElt = Builder.CreateInsertVector(
12840 if (!
II ||
II->getIntrinsicID() != Intrinsic::vector_insert)
12843 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
12848 GatherShuffleExtractSeq.
insert(InsElt);
12852 if (TreeEntry *Entry = getTreeEntry(V)) {
12854 User *UserOp =
nullptr;
12862 unsigned FoundLane = Entry->findLaneForValue(V);
12863 ExternalUses.emplace_back(V, UserOp, FoundLane);
12873 for (
int I = 0, E = VL.
size();
I < E; ++
I) {
12892 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
12895 for (
int I : NonConsts)
12896 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
12899 for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
12900 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
12938 bool IsFinalized =
false;
12951 class ShuffleIRBuilder {
12964 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
12965 CSEBlocks(CSEBlocks),
DL(
DL) {}
12966 ~ShuffleIRBuilder() =
default;
12969 if (V1->
getType() != V2->getType()) {
12972 "Expected integer vector types only.");
12973 if (V1->
getType() != V2->getType()) {
12978 ->getIntegerBitWidth())
12979 V2 = Builder.CreateIntCast(
12982 V1 = Builder.CreateIntCast(
12986 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
12988 GatherShuffleExtractSeq.
insert(
I);
12989 CSEBlocks.
insert(
I->getParent());
12998 unsigned VF = Mask.size();
13002 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
13004 GatherShuffleExtractSeq.
insert(
I);
13005 CSEBlocks.
insert(
I->getParent());
13009 Value *createIdentity(
Value *V) {
return V; }
13010 Value *createPoison(
Type *Ty,
unsigned VF) {
13015 void resizeToMatch(
Value *&V1,
Value *&V2) {
13016 if (V1->
getType() == V2->getType())
13020 int VF = std::max(V1VF, V2VF);
13021 int MinVF = std::min(V1VF, V2VF);
13023 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
13025 Value *&
Op = MinVF == V1VF ? V1 : V2;
13026 Op = Builder.CreateShuffleVector(
Op, IdentityMask);
13028 GatherShuffleExtractSeq.
insert(
I);
13029 CSEBlocks.
insert(
I->getParent());
13042 assert(V1 &&
"Expected at least one vector value.");
13043 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
13044 R.CSEBlocks, *R.DL);
13045 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
13053 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
13055 CommonMask[Idx] = Idx;
13061 std::optional<bool> IsSigned = std::nullopt) {
13066 return Builder.CreateIntCast(
13073 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
13077 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
13078 unsigned NumParts,
bool &UseVecBaseAsInput) {
13079 UseVecBaseAsInput =
false;
13081 Value *VecBase =
nullptr;
13082 for (
int I = 0, Sz = Mask.size();
I < Sz; ++
I) {
13087 VecBase = EI->getVectorOperand();
13088 if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
13089 VecBase = TE->VectorizedValue;
13090 assert(VecBase &&
"Expected vectorized value.");
13091 UniqueBases.
insert(VecBase);
13094 if (!EI->hasOneUse() || (NumParts != 1 &&
count(E->Scalars, EI) > 1) ||
13096 const TreeEntry *UTE = R.getTreeEntry(U);
13097 return !UTE || R.MultiNodeScalars.contains(U) ||
13098 (isa<GetElementPtrInst>(U) &&
13099 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
13100 count_if(R.VectorizableTree,
13101 [&](const std::unique_ptr<TreeEntry> &TE) {
13102 return any_of(TE->UserTreeIndices,
13103 [&](const EdgeInfo &Edge) {
13104 return Edge.UserTE == UTE;
13106 is_contained(TE->Scalars, EI);
13110 R.eraseInstruction(EI);
13112 if (NumParts == 1 || UniqueBases.
size() == 1) {
13113 assert(VecBase &&
"Expected vectorized value.");
13114 return castToScalarTyElem(VecBase);
13116 UseVecBaseAsInput =
true;
13126 Value *Vec =
nullptr;
13130 unsigned Limit =
getNumElems(E->Scalars.size(), SliceSize, Part);
13134 constexpr int MaxBases = 2;
13136 auto VLMask =
zip(VL, SubMask);
13137 const unsigned VF = std::accumulate(
13138 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
const auto &
D) {
13139 if (std::get<1>(D) == PoisonMaskElem)
13142 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
13143 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
13144 VecOp = TE->VectorizedValue;
13145 assert(VecOp &&
"Expected vectorized value.");
13146 const unsigned Size =
13147 cast<FixedVectorType>(VecOp->getType())->getNumElements();
13148 return std::max(S, Size);
13150 for (
const auto [V,
I] : VLMask) {
13154 if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
13155 VecOp = TE->VectorizedValue;
13156 assert(VecOp &&
"Expected vectorized value.");
13157 VecOp = castToScalarTyElem(VecOp);
13158 Bases[
I / VF] = VecOp;
13160 if (!Bases.front())
13163 if (Bases.back()) {
13164 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
13165 TransformToIdentity(SubMask);
13167 SubVec = Bases.front();
13174 Mask.slice(
P * SliceSize,
13177 return all_of(SubMask, [](
int Idx) {
13181 "Expected first part or all previous parts masked.");
13182 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
13187 unsigned SubVecVF =
13189 NewVF = std::max(NewVF, SubVecVF);
13192 for (
int &Idx : SubMask)
13195 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
13196 Vec = createShuffle(Vec, SubVec, VecMask);
13197 TransformToIdentity(VecMask);
13205 std::optional<Value *>
13211 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
13213 return std::nullopt;
13216 auto *ResVecTy =
getWidenedType(ScalarTy, E->getVectorFactor());
13217 return Builder.CreateAlignedLoad(
13225 Value *V1 = E1.VectorizedValue;
13227 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
13228 return !isKnownNonNegative(
13229 V, SimplifyQuery(*R.DL));
13231 Value *V2 = E2.VectorizedValue;
13232 if (V2->getType()->isIntOrIntVectorTy())
13233 V2 = castToScalarTyElem(V2,
any_of(E2.Scalars, [&](
Value *V) {
13234 return !isKnownNonNegative(
13235 V, SimplifyQuery(*R.DL));
13242 Value *V1 = E1.VectorizedValue;
13244 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
13245 return !isKnownNonNegative(
13246 V, SimplifyQuery(*R.DL));
13252 assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
13255 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
13256 V1 = castToScalarTyElem(V1);
13257 V2 = castToScalarTyElem(V2);
13258 if (InVectors.
empty()) {
13261 CommonMask.
assign(Mask.begin(), Mask.end());
13265 if (InVectors.
size() == 2) {
13266 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
13267 transformMaskAfterShuffle(CommonMask, CommonMask);
13270 Vec = createShuffle(Vec,
nullptr, CommonMask);
13271 transformMaskAfterShuffle(CommonMask, CommonMask);
13273 V1 = createShuffle(V1, V2, Mask);
13274 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
13276 CommonMask[Idx] = Idx + Sz;
13277 InVectors.
front() = Vec;
13278 if (InVectors.
size() == 2)
13279 InVectors.
back() = V1;
13286 "castToScalarTyElem expects V1 to be FixedVectorType");
13287 V1 = castToScalarTyElem(V1);
13288 if (InVectors.
empty()) {
13290 CommonMask.
assign(Mask.begin(), Mask.end());
13293 const auto *It =
find(InVectors, V1);
13294 if (It == InVectors.
end()) {
13295 if (InVectors.
size() == 2 ||
13298 if (InVectors.
size() == 2) {
13299 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
13300 transformMaskAfterShuffle(CommonMask, CommonMask);
13302 CommonMask.
size()) {
13303 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
13304 transformMaskAfterShuffle(CommonMask, CommonMask);
13306 unsigned VF = std::max(CommonMask.
size(), Mask.size());
13307 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
13310 V->getType() != V1->
getType()
13313 ->getNumElements();
13314 if (V->getType() != V1->
getType())
13315 V1 = createShuffle(V1,
nullptr, Mask);
13316 InVectors.
front() = V;
13317 if (InVectors.
size() == 2)
13318 InVectors.
back() = V1;
13325 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
13331 int VF = getVF(V1);
13332 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
13334 CommonMask[Idx] = Mask[Idx] + (It == InVectors.
begin() ? 0 : VF);
13343 Value *Root =
nullptr) {
13344 return R.gather(VL, Root, ScalarTy);
13352 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
13355 IsFinalized =
true;
13363 ExtMask = NewExtMask;
13367 if (InVectors.
size() == 2) {
13368 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
13371 Vec = createShuffle(Vec,
nullptr, CommonMask);
13373 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
13375 CommonMask[Idx] = Idx;
13377 "Expected vector length for the final value before action.");
13381 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
13382 Vec = createShuffle(Vec,
nullptr, ResizeMask);
13384 Action(Vec, CommonMask);
13385 InVectors.
front() = Vec;
13387 if (!SubVectors.empty()) {
13389 if (InVectors.
size() == 2) {
13390 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
13393 Vec = createShuffle(Vec,
nullptr, CommonMask);
13395 for (
unsigned Idx = 0, Sz = CommonMask.
size(); Idx < Sz; ++Idx)
13397 CommonMask[Idx] = Idx;
13398 for (
auto [E, Idx] : SubVectors) {
13399 Value *V = castToScalarTyElem(E->VectorizedValue);
13400 Vec = Builder.CreateInsertVector(Vec->
getType(), Vec, V,
13401 Builder.getInt64(Idx));
13402 if (!CommonMask.
empty()) {
13403 std::iota(std::next(CommonMask.
begin(), Idx),
13404 std::next(CommonMask.
begin(), Idx + E->getVectorFactor()),
13408 InVectors.
front() = Vec;
13411 if (!ExtMask.
empty()) {
13412 if (CommonMask.
empty()) {
13416 for (
int I = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
13419 NewMask[
I] = CommonMask[ExtMask[
I]];
13421 CommonMask.
swap(NewMask);
13424 if (CommonMask.
empty()) {
13425 assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
13426 return InVectors.
front();
13428 if (InVectors.
size() == 2)
13429 return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
13430 return createShuffle(InVectors.
front(),
nullptr, CommonMask);
13435 "Shuffle construction must be finalized.");
13439BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(
const TreeEntry *E,
13440 unsigned NodeIdx) {
13444 if (!S.getOpcode() && VL.
front()->getType()->isPointerTy()) {
13446 if (It != VL.
end())
13449 if (!S.getOpcode())
13451 auto CheckSameVE = [&](
const TreeEntry *VE) {
13452 return VE->isSame(VL) &&
13453 (
any_of(VE->UserTreeIndices,
13454 [E, NodeIdx](
const EdgeInfo &EI) {
13455 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
13457 any_of(VectorizableTree,
13458 [E, NodeIdx, VE](
const std::unique_ptr<TreeEntry> &TE) {
13459 return TE->isOperandGatherNode(
13460 {
const_cast<TreeEntry *
>(
E), NodeIdx}) &&
13461 VE->isSame(TE->Scalars);
13464 TreeEntry *VE = getTreeEntry(S.OpValue);
13465 if (VE && CheckSameVE(VE))
13467 auto It = MultiNodeScalars.
find(S.OpValue);
13468 if (It != MultiNodeScalars.
end()) {
13469 auto *
I =
find_if(It->getSecond(), [&](
const TreeEntry *TE) {
13470 return TE != VE && CheckSameVE(TE);
13472 if (
I != It->getSecond().end())
13478Value *BoUpSLP::vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
13479 bool PostponedPHIs) {
13480 ValueList &VL = E->getOperand(NodeIdx);
13481 const unsigned VF = VL.size();
13482 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
13489 ShuffleInstructionBuilder ShuffleBuilder(
13493 ShuffleBuilder.add(V, Mask);
13495 E->CombinedEntriesWithIndices.size());
13496 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
13497 [&](
const auto &
P) {
13498 return std::make_pair(VectorizableTree[P.first].get(),
13501 return ShuffleBuilder.finalize({}, SubVectors);
13506 if (!VE->ReuseShuffleIndices.empty()) {
13529 Mask[
I] = VE->findLaneForValue(V);
13531 V = FinalShuffle(V, Mask);
13534 "Expected vectorization factor less "
13535 "than original vector size.");
13537 std::iota(UniformMask.begin(), UniformMask.end(), 0);
13538 V = FinalShuffle(V, UniformMask);
13544 if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
13545 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
13546 }) == VE->UserTreeIndices.end()) {
13548 find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
13549 return TE->isGather() &&
TE->UserTreeIndices.front().UserTE == E &&
13550 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
13552 assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
13553 (*It)->VectorizedValue = V;
13561 auto *
I =
find_if(VectorizableTree,
13562 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
13563 return TE->isOperandGatherNode({
E, NodeIdx});
13565 assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
13566 assert(
I->get()->UserTreeIndices.size() == 1 &&
13567 "Expected only single user for the gather node.");
13568 assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
13572template <
typename BVTy,
typename ResTy,
typename...
Args>
13573ResTy BoUpSLP::processBuildVector(
const TreeEntry *E,
Type *ScalarTy,
13575 assert(E->isGather() &&
"Expected gather node.");
13576 unsigned VF = E->getVectorFactor();
13578 bool NeedFreeze =
false;
13580 E->ReuseShuffleIndices.end());
13583 for (
auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
13585 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
13588 E->CombinedEntriesWithIndices.size());
13589 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
13590 [&](
const auto &
P) {
13591 return std::make_pair(VectorizableTree[P.first].get(), P.second);
13597 if (!ReorderMask.
empty())
13600 unsigned I,
unsigned SliceSize) {
13602 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
13605 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
13606 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
13607 if (UserTE->getNumOperands() != 2)
13610 find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
13611 return find_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
13612 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
13613 }) !=
TE->UserTreeIndices.end();
13615 if (It == VectorizableTree.end())
13618 if ((
Mask.size() < InputVF &&
13621 (
Mask.size() == InputVF &&
13624 std::next(
Mask.begin(),
I * SliceSize),
13625 std::next(
Mask.begin(),
13632 std::next(
Mask.begin(),
I * SliceSize),
13633 std::next(
Mask.begin(),
13639 BVTy ShuffleBuilder(ScalarTy, Params...);
13640 ResTy Res = ResTy();
13644 Value *ExtractVecBase =
nullptr;
13645 bool UseVecBaseAsInput =
false;
13648 Type *OrigScalarTy = GatheredScalars.front()->getType();
13651 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
13658 bool Resized =
false;
13660 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
13661 if (!ExtractShuffles.
empty()) {
13663 for (
auto [Idx,
I] :
enumerate(ExtractMask)) {
13666 if (
const auto *TE = getTreeEntry(
13670 if (std::optional<ResTy> Delayed =
13671 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
13673 PostponedGathers.
insert(E);
13678 if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
13679 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
13680 ExtractVecBase = VecBase;
13682 if (VF == VecBaseTy->getNumElements() &&
13683 GatheredScalars.size() != VF) {
13685 GatheredScalars.append(VF - GatheredScalars.size(),
13691 if (!ExtractShuffles.
empty() || E->getOpcode() != Instruction::Load ||
13692 ((E->getOpcode() == Instruction::Load ||
13696 return isa<LoadInst>(V) && getTreeEntry(V);
13698 E->isAltShuffle() ||
13699 all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
13701 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
13703 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
13705 if (!GatherShuffles.
empty()) {
13706 if (std::optional<ResTy> Delayed =
13707 ShuffleBuilder.needToDelay(E, Entries)) {
13709 PostponedGathers.
insert(E);
13714 if (GatherShuffles.
size() == 1 &&
13716 Entries.front().front()->isSame(E->Scalars)) {
13719 LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle "
13722 Mask.resize(E->Scalars.size());
13723 const TreeEntry *FrontTE = Entries.front().front();
13724 if (FrontTE->ReorderIndices.empty() &&
13725 ((FrontTE->ReuseShuffleIndices.empty() &&
13726 E->Scalars.size() == FrontTE->Scalars.size()) ||
13727 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
13728 std::iota(
Mask.begin(),
Mask.end(), 0);
13735 Mask[
I] = FrontTE->findLaneForValue(V);
13738 ShuffleBuilder.add(*FrontTE, Mask);
13739 Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors);
13743 if (GatheredScalars.size() != VF &&
13745 return any_of(TEs, [&](
const TreeEntry *TE) {
13746 return TE->getVectorFactor() == VF;
13749 GatheredScalars.append(VF - GatheredScalars.size(),
13753 for (
int I = 0, Sz =
Mask.size();
I < Sz; ++
I) {
13761 bool IsRootPoison) {
13764 bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
13771 int NumNonConsts = 0;
13790 Scalars.
front() = OrigV;
13793 const auto Res = UniquePositions.
try_emplace(OrigV,
I);
13794 Scalars[Res.first->second] = OrigV;
13795 ReuseMask[
I] = Res.first->second;
13798 if (NumNonConsts == 1) {
13803 if (!UndefPos.
empty() && UndefPos.
front() == 0)
13806 ReuseMask[SinglePos] = SinglePos;
13807 }
else if (!UndefPos.
empty() && IsSplat) {
13814 (E->UserTreeIndices.size() == 1 &&
13818 return E->UserTreeIndices.front().EdgeIdx !=
13819 U.getOperandNo() &&
13821 E->UserTreeIndices.front().UserTE->Scalars,
13825 if (It != Scalars.
end()) {
13827 int Pos = std::distance(Scalars.
begin(), It);
13828 for (
int I : UndefPos) {
13830 ReuseMask[
I] = Pos;
13839 for (
int I : UndefPos) {
13848 if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
13849 bool IsNonPoisoned =
true;
13850 bool IsUsedInExpr =
true;
13851 Value *Vec1 =
nullptr;
13852 if (!ExtractShuffles.
empty()) {
13856 Value *Vec2 =
nullptr;
13857 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
13861 if (UseVecBaseAsInput) {
13862 Vec1 = ExtractVecBase;
13864 for (
unsigned I = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
13870 Value *VecOp = EI->getVectorOperand();
13871 if (
const auto *TE = getTreeEntry(VecOp))
13872 if (
TE->VectorizedValue)
13873 VecOp =
TE->VectorizedValue;
13876 }
else if (Vec1 != VecOp) {
13877 assert((!Vec2 || Vec2 == VecOp) &&
13878 "Expected only 1 or 2 vectors shuffle.");
13884 IsUsedInExpr =
false;
13887 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
13889 IsUsedInExpr &= FindReusedSplat(
13892 ExtractMask.size());
13893 ShuffleBuilder.add(Vec1, ExtractMask,
true);
13896 IsUsedInExpr =
false;
13901 if (!GatherShuffles.
empty()) {
13904 for (
const auto [
I, TEs] :
enumerate(Entries)) {
13907 "No shuffles with empty entries list expected.");
13911 "Expected shuffle of 1 or 2 entries.");
13915 copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
13916 if (TEs.
size() == 1) {
13917 IsUsedInExpr &= FindReusedSplat(
13918 VecMask, TEs.
front()->getVectorFactor(),
I, SliceSize);
13919 ShuffleBuilder.add(*TEs.
front(), VecMask);
13920 if (TEs.
front()->VectorizedValue)
13924 IsUsedInExpr =
false;
13925 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
13926 if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
13937 int EMSz = ExtractMask.size();
13938 int MSz =
Mask.size();
13941 bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
13942 bool IsIdentityShuffle =
13943 ((UseVecBaseAsInput ||
13945 [](
const std::optional<TTI::ShuffleKind> &SK) {
13949 none_of(ExtractMask, [&](
int I) {
return I >= EMSz; }) &&
13951 (!GatherShuffles.
empty() &&
13953 [](
const std::optional<TTI::ShuffleKind> &SK) {
13957 none_of(Mask, [&](
int I) {
return I >= MSz; }) &&
13959 bool EnoughConstsForShuffle =
13969 (!IsIdentityShuffle ||
13970 (GatheredScalars.size() == 2 &&
13978 for (
int I = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
13979 if (EnoughConstsForShuffle &&
isa<Constant>(GatheredScalars[
I]))
13987 TryPackScalars(GatheredScalars, BVMask,
true);
13988 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
13989 ShuffleBuilder.add(BV, BVMask);
13993 (IsSingleShuffle && ((IsIdentityShuffle &&
13996 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
13998 Res = ShuffleBuilder.finalize(
13999 E->ReuseShuffleIndices, SubVectors, E->Scalars.size(),
14001 TryPackScalars(NonConstants, Mask, false);
14002 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
14007 TryPackScalars(GatheredScalars, ReuseMask,
true);
14008 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
14009 ShuffleBuilder.add(BV, ReuseMask);
14010 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
14014 for (
auto [
I, V] :
enumerate(GatheredScalars)) {
14018 Value *BV = ShuffleBuilder.gather(GatheredScalars);
14019 ShuffleBuilder.add(BV, Mask);
14020 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
14024 Res = ShuffleBuilder.createFreeze(Res);
14028Value *BoUpSLP::createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
14029 bool PostponedPHIs) {
14030 for (
auto [EIdx,
_] : E->CombinedEntriesWithIndices)
14032 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
14039 if (E->VectorizedValue &&
14040 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
14041 E->isAltShuffle())) {
14042 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *E->Scalars[0] <<
".\n");
14043 return E->VectorizedValue;
14046 Value *V = E->Scalars.front();
14047 Type *ScalarTy = V->getType();
14050 auto It = MinBWs.find(E);
14051 if (It != MinBWs.end()) {
14058 if (E->isGather()) {
14060 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
14061 setInsertPointAfterBundle(E);
14062 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
14063 E->VectorizedValue = Vec;
14068 auto FinalShuffle = [&](
Value *V,
const TreeEntry *
E) {
14069 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
14070 if (E->getOpcode() == Instruction::Store &&
14071 E->State == TreeEntry::Vectorize) {
14073 ArrayRef(
reinterpret_cast<const int *
>(E->ReorderIndices.begin()),
14074 E->ReorderIndices.size());
14075 ShuffleBuilder.add(V, Mask);
14076 }
else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
14077 ShuffleBuilder.addOrdered(V, {});
14079 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
14082 E->CombinedEntriesWithIndices.size());
14084 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
const auto &
P) {
14085 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14087 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
14090 assert(!E->isGather() &&
"Unhandled state");
14091 unsigned ShuffleOrOp =
14094 auto GetOperandSignedness = [&](
unsigned Idx) {
14095 const TreeEntry *OpE = getOperandEntry(E, Idx);
14096 bool IsSigned =
false;
14097 auto It = MinBWs.find(OpE);
14098 if (It != MinBWs.end())
14099 IsSigned = It->second.second;
14102 return !isKnownNonNegative(R, SimplifyQuery(*DL));
14106 switch (ShuffleOrOp) {
14107 case Instruction::PHI: {
14108 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
14109 E != VectorizableTree.front().get() ||
14110 !E->UserTreeIndices.empty()) &&
14111 "PHI reordering is free.");
14112 if (PostponedPHIs && E->VectorizedValue)
14113 return E->VectorizedValue;
14115 Builder.SetInsertPoint(PH->getParent(),
14116 PH->getParent()->getFirstNonPHIIt());
14117 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
14118 if (PostponedPHIs || !E->VectorizedValue) {
14119 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
14124 Builder.SetInsertPoint(PH->getParent(),
14125 PH->getParent()->getFirstInsertionPt());
14126 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
14128 V = FinalShuffle(V, E);
14130 E->VectorizedValue = V;
14143 for (
unsigned I :
seq<unsigned>(0, PH->getNumIncomingValues())) {
14149 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14153 if (!VisitedBBs.
insert(IBB).second) {
14159 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
14160 Value *Vec = vectorizeOperand(E,
I,
true);
14161 if (VecTy != Vec->
getType()) {
14163 MinBWs.contains(getOperandEntry(E,
I))) &&
14164 "Expected item in MinBWs.");
14165 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
14171 "Invalid number of incoming values");
14172 assert(E->VectorizedValue &&
"Expected vectorized value.");
14173 return E->VectorizedValue;
14176 case Instruction::ExtractElement: {
14177 Value *V = E->getSingleOperand(0);
14178 if (
const TreeEntry *TE = getTreeEntry(V))
14179 V =
TE->VectorizedValue;
14180 setInsertPointAfterBundle(E);
14181 V = FinalShuffle(V, E);
14182 E->VectorizedValue = V;
14185 case Instruction::ExtractValue: {
14187 Builder.SetInsertPoint(LI);
14191 NewV = FinalShuffle(NewV, E);
14192 E->VectorizedValue = NewV;
14195 case Instruction::InsertElement: {
14196 assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
14198 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
14200 Type *ScalarTy =
Op.front()->getType();
14203 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
14204 assert(Res.first > 0 &&
"Expected item in MinBWs.");
14205 V = Builder.CreateIntCast(
14215 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14217 const unsigned NumElts =
14219 const unsigned NumScalars = E->Scalars.size();
14222 assert(
Offset < NumElts &&
"Failed to find vector index offset");
14226 if (!E->ReorderIndices.empty()) {
14231 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
14234 bool IsIdentity =
true;
14236 Mask.swap(PrevMask);
14237 for (
unsigned I = 0;
I < NumScalars; ++
I) {
14240 IsIdentity &= InsertIdx -
Offset ==
I;
14243 if (!IsIdentity || NumElts != NumScalars) {
14247 if (NumElts != NumScalars &&
Offset == 0) {
14256 InsertMask[*InsertIdx] = *InsertIdx;
14257 if (!
Ins->hasOneUse())
14260 Ins->getUniqueUndroppableUser());
14263 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
14268 if (!IsFirstPoison.
all()) {
14270 for (
unsigned I = 0;
I < NumElts;
I++) {
14272 IsFirstUndef.
test(
I)) {
14273 if (IsVNonPoisonous) {
14274 InsertMask[
I] =
I < NumScalars ?
I : 0;
14279 if (Idx >= NumScalars)
14280 Idx = NumScalars - 1;
14281 InsertMask[
I] = NumScalars + Idx;
14294 V = Builder.CreateShuffleVector(V, V2, InsertMask);
14296 GatherShuffleExtractSeq.
insert(
I);
14297 CSEBlocks.
insert(
I->getParent());
14302 for (
unsigned I = 0;
I < NumElts;
I++) {
14307 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
14310 if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
14311 NumElts != NumScalars) {
14312 if (IsFirstUndef.
all()) {
14316 if (!IsFirstPoison.
all()) {
14317 for (
unsigned I = 0;
I < NumElts;
I++) {
14319 InsertMask[
I] =
I + NumElts;
14322 V = Builder.CreateShuffleVector(
14328 GatherShuffleExtractSeq.
insert(
I);
14329 CSEBlocks.
insert(
I->getParent());
14335 for (
unsigned I = 0;
I < NumElts;
I++) {
14339 InsertMask[
I] += NumElts;
14341 V = Builder.CreateShuffleVector(
14342 FirstInsert->getOperand(0), V, InsertMask,
14345 GatherShuffleExtractSeq.
insert(
I);
14346 CSEBlocks.
insert(
I->getParent());
14351 ++NumVectorInstructions;
14352 E->VectorizedValue = V;
14355 case Instruction::ZExt:
14356 case Instruction::SExt:
14357 case Instruction::FPToUI:
14358 case Instruction::FPToSI:
14359 case Instruction::FPExt:
14360 case Instruction::PtrToInt:
14361 case Instruction::IntToPtr:
14362 case Instruction::SIToFP:
14363 case Instruction::UIToFP:
14364 case Instruction::Trunc:
14365 case Instruction::FPTrunc:
14366 case Instruction::BitCast: {
14367 setInsertPointAfterBundle(E);
14369 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
14370 if (E->VectorizedValue) {
14371 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14372 return E->VectorizedValue;
14378 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14380 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
14383 unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
14384 if (SrcIt != MinBWs.end())
14385 SrcBWSz = SrcIt->second.first;
14387 if (BWSz == SrcBWSz) {
14388 VecOpcode = Instruction::BitCast;
14389 }
else if (BWSz < SrcBWSz) {
14390 VecOpcode = Instruction::Trunc;
14391 }
else if (It != MinBWs.end()) {
14392 assert(BWSz > SrcBWSz &&
"Invalid cast!");
14393 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14394 }
else if (SrcIt != MinBWs.end()) {
14395 assert(BWSz > SrcBWSz &&
"Invalid cast!");
14397 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14399 }
else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14400 !SrcIt->second.second) {
14401 VecOpcode = Instruction::UIToFP;
14403 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
14405 : Builder.CreateCast(VecOpcode, InVec, VecTy);
14406 V = FinalShuffle(V, E);
14408 E->VectorizedValue = V;
14409 ++NumVectorInstructions;
14412 case Instruction::FCmp:
14413 case Instruction::ICmp: {
14414 setInsertPointAfterBundle(E);
14416 Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
14417 if (E->VectorizedValue) {
14418 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14419 return E->VectorizedValue;
14421 Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
14422 if (E->VectorizedValue) {
14423 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14424 return E->VectorizedValue;
14426 if (
L->getType() !=
R->getType()) {
14428 getOperandEntry(E, 1)->
isGather() ||
14429 MinBWs.contains(getOperandEntry(E, 0)) ||
14430 MinBWs.contains(getOperandEntry(E, 1))) &&
14431 "Expected item in MinBWs.");
14436 ->getIntegerBitWidth()) {
14437 Type *CastTy =
R->getType();
14438 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
14440 Type *CastTy =
L->getType();
14441 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
14446 Value *V = Builder.CreateCmp(P0, L, R);
14450 V = FinalShuffle(V, E);
14452 E->VectorizedValue = V;
14453 ++NumVectorInstructions;
14456 case Instruction::Select: {
14457 setInsertPointAfterBundle(E);
14459 Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
14460 if (E->VectorizedValue) {
14461 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14462 return E->VectorizedValue;
14464 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
14465 if (E->VectorizedValue) {
14466 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14467 return E->VectorizedValue;
14469 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
14470 if (E->VectorizedValue) {
14471 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14472 return E->VectorizedValue;
14475 assert((It != MinBWs.end() || getOperandEntry(E, 1)->
isGather() ||
14476 getOperandEntry(E, 2)->
isGather() ||
14477 MinBWs.contains(getOperandEntry(E, 1)) ||
14478 MinBWs.contains(getOperandEntry(E, 2))) &&
14479 "Expected item in MinBWs.");
14480 if (True->
getType() != VecTy)
14481 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
14482 if (False->
getType() != VecTy)
14483 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
14488 assert(TrueNumElements >= CondNumElements &&
14489 TrueNumElements % CondNumElements == 0 &&
14490 "Cannot vectorize Instruction::Select");
14492 "Cannot vectorize Instruction::Select");
14493 if (CondNumElements != TrueNumElements) {
14496 Cond = Builder.CreateShuffleVector(
14501 "Cannot vectorize Instruction::Select");
14502 Value *V = Builder.CreateSelect(
Cond, True, False);
14503 V = FinalShuffle(V, E);
14505 E->VectorizedValue = V;
14506 ++NumVectorInstructions;
14509 case Instruction::FNeg: {
14510 setInsertPointAfterBundle(E);
14512 Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
14514 if (E->VectorizedValue) {
14515 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14516 return E->VectorizedValue;
14519 Value *V = Builder.CreateUnOp(
14525 V = FinalShuffle(V, E);
14527 E->VectorizedValue = V;
14528 ++NumVectorInstructions;
14532 case Instruction::Freeze: {
14533 setInsertPointAfterBundle(E);
14535 Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
14537 if (E->VectorizedValue) {
14538 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14539 return E->VectorizedValue;
14542 Value *V = Builder.CreateFreeze(
Op);
14543 V = FinalShuffle(V, E);
14545 E->VectorizedValue = V;
14546 ++NumVectorInstructions;
14550 case Instruction::Add:
14551 case Instruction::FAdd:
14552 case Instruction::Sub:
14553 case Instruction::FSub:
14554 case Instruction::Mul:
14555 case Instruction::FMul:
14556 case Instruction::UDiv:
14557 case Instruction::SDiv:
14558 case Instruction::FDiv:
14559 case Instruction::URem:
14560 case Instruction::SRem:
14561 case Instruction::FRem:
14562 case Instruction::Shl:
14563 case Instruction::LShr:
14564 case Instruction::AShr:
14565 case Instruction::And:
14566 case Instruction::Or:
14567 case Instruction::Xor: {
14568 setInsertPointAfterBundle(E);
14570 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
14571 if (E->VectorizedValue) {
14572 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14573 return E->VectorizedValue;
14575 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
14576 if (E->VectorizedValue) {
14577 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14578 return E->VectorizedValue;
14580 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14585 return CI && CI->getValue().countr_one() >= It->second.first;
14587 V = FinalShuffle(
I == 0 ? RHS : LHS, E);
14588 E->VectorizedValue = V;
14589 ++NumVectorInstructions;
14594 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
14595 assert((It != MinBWs.end() || getOperandEntry(E, 0)->
isGather() ||
14596 getOperandEntry(E, 1)->
isGather() ||
14597 MinBWs.contains(getOperandEntry(E, 0)) ||
14598 MinBWs.contains(getOperandEntry(E, 1))) &&
14599 "Expected item in MinBWs.");
14600 if (LHS->getType() != VecTy)
14601 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
14602 if (RHS->getType() != VecTy)
14603 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
14606 Value *V = Builder.CreateBinOp(
14613 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
14615 return isCommutative(cast<Instruction>(V));
14617 I->setHasNoUnsignedWrap(
false);
14620 V = FinalShuffle(V, E);
14622 E->VectorizedValue = V;
14623 ++NumVectorInstructions;
14627 case Instruction::Load: {
14630 setInsertPointAfterBundle(E);
14635 if (E->State == TreeEntry::Vectorize) {
14636 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->
getAlign());
14637 }
else if (E->State == TreeEntry::StridedVectorize) {
14640 PO = IsReverseOrder ? PtrN : Ptr0;
14646 int Stride = *Diff / (
static_cast<int>(E->Scalars.size()) - 1);
14648 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
14649 DL->getTypeAllocSize(ScalarTy));
14653 return cast<LoadInst>(V)->getPointerOperand();
14656 std::optional<Value *> Stride =
14658 &*Builder.GetInsertPoint());
14660 Builder.CreateIntCast(*Stride, StrideTy,
true);
14661 StrideVal = Builder.CreateMul(
14665 (IsReverseOrder ? -1 : 1) *
14666 static_cast<int>(
DL->getTypeAllocSize(ScalarTy))));
14669 auto *Inst = Builder.CreateIntrinsic(
14670 Intrinsic::experimental_vp_strided_load,
14671 {VecTy, PO->
getType(), StrideTy},
14673 Builder.getInt32(E->Scalars.size())});
14674 Inst->addParamAttr(
14679 assert(E->State == TreeEntry::ScatterVectorize &&
"Unhandled state");
14680 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
14681 if (E->VectorizedValue) {
14682 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14683 return E->VectorizedValue;
14689 unsigned ScalarTyNumElements =
14691 unsigned VecTyNumElements =
14693 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
14694 "Cannot expand getelementptr.");
14695 unsigned VF = VecTyNumElements / ScalarTyNumElements;
14697 transform(
seq(VecTyNumElements), Indices.begin(), [=](
unsigned I) {
14698 return Builder.getInt64(I % ScalarTyNumElements);
14700 VecPtr = Builder.CreateGEP(
14702 Builder.CreateShuffleVector(
14708 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
14712 V = FinalShuffle(V, E);
14713 E->VectorizedValue = V;
14714 ++NumVectorInstructions;
14717 case Instruction::Store: {
14720 setInsertPointAfterBundle(E);
14722 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
14723 if (VecValue->
getType() != VecTy)
14725 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
14726 VecValue = FinalShuffle(VecValue, E);
14730 if (E->State == TreeEntry::Vectorize) {
14731 ST = Builder.CreateAlignedStore(VecValue,
Ptr,
SI->getAlign());
14733 assert(E->State == TreeEntry::StridedVectorize &&
14734 "Expected either strided or consecutive stores.");
14735 if (!E->ReorderIndices.empty()) {
14737 Ptr =
SI->getPointerOperand();
14740 Type *StrideTy =
DL->getIndexType(
SI->getPointerOperandType());
14741 auto *Inst = Builder.CreateIntrinsic(
14742 Intrinsic::experimental_vp_strided_store,
14743 {VecTy,
Ptr->getType(), StrideTy},
14746 StrideTy, -
static_cast<int>(
DL->getTypeAllocSize(ScalarTy))),
14748 Builder.getInt32(E->Scalars.size())});
14749 Inst->addParamAttr(
14757 E->VectorizedValue = V;
14758 ++NumVectorInstructions;
14761 case Instruction::GetElementPtr: {
14763 setInsertPointAfterBundle(E);
14765 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
14766 if (E->VectorizedValue) {
14767 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14768 return E->VectorizedValue;
14772 for (
int J = 1,
N = GEP0->getNumOperands(); J <
N; ++J) {
14773 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
14774 if (E->VectorizedValue) {
14775 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14776 return E->VectorizedValue;
14781 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
14784 for (
Value *V : E->Scalars) {
14791 V = FinalShuffle(V, E);
14793 E->VectorizedValue = V;
14794 ++NumVectorInstructions;
14798 case Instruction::Call: {
14800 setInsertPointAfterBundle(E);
14806 It != MinBWs.end() ? It->second.first : 0);
14809 VecCallCosts.first <= VecCallCosts.second;
14811 Value *ScalarArg =
nullptr;
14823 ScalarArg = CEI->getArgOperand(
I);
14826 if (ID == Intrinsic::abs && It != MinBWs.end() &&
14827 It->second.first <
DL->getTypeSizeInBits(CEI->getType()))
14828 ScalarArg = Builder.getFalse();
14835 Value *OpVec = vectorizeOperand(E,
I, PostponedPHIs);
14836 if (E->VectorizedValue) {
14837 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14838 return E->VectorizedValue;
14840 ScalarArg = CEI->getArgOperand(
I);
14843 It == MinBWs.end()) {
14846 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(
I));
14847 }
else if (It != MinBWs.end()) {
14848 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(
I));
14857 if (!UseIntrinsic) {
14870 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
14873 V = FinalShuffle(V, E);
14875 E->VectorizedValue = V;
14876 ++NumVectorInstructions;
14879 case Instruction::ShuffleVector: {
14881 if (
SLPReVec && !E->isAltShuffle()) {
14882 assert(E->ReuseShuffleIndices.empty() &&
14883 "Not support ReuseShuffleIndices yet.");
14884 assert(E->ReorderIndices.empty() &&
"Not support ReorderIndices yet.");
14885 setInsertPointAfterBundle(E);
14886 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
14887 if (E->VectorizedValue) {
14888 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14889 return E->VectorizedValue;
14892 "Not supported shufflevector usage.");
14895 "Not supported shufflevector usage.");
14899 [&SVSrc](
int Mask) { return SVSrc->getShuffleMask()[Mask]; });
14900 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
14903 assert(E->isAltShuffle() &&
14909 "Invalid Shuffle Vector Operand");
14911 Value *LHS =
nullptr, *RHS =
nullptr;
14913 setInsertPointAfterBundle(E);
14914 LHS = vectorizeOperand(E, 0, PostponedPHIs);
14915 if (E->VectorizedValue) {
14916 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14917 return E->VectorizedValue;
14919 RHS = vectorizeOperand(E, 1, PostponedPHIs);
14921 setInsertPointAfterBundle(E);
14922 LHS = vectorizeOperand(E, 0, PostponedPHIs);
14924 if (E->VectorizedValue) {
14925 LLVM_DEBUG(
dbgs() <<
"SLP: Diamond merged for " << *VL0 <<
".\n");
14926 return E->VectorizedValue;
14930 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
14931 (
isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
14932 assert((It != MinBWs.end() ||
14933 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
14934 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
14935 MinBWs.contains(getOperandEntry(E, 0)) ||
14936 MinBWs.contains(getOperandEntry(E, 1))) &&
14937 "Expected item in MinBWs.");
14938 Type *CastTy = VecTy;
14939 if (
isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
14944 ->getIntegerBitWidth())
14945 CastTy = RHS->getType();
14947 CastTy = LHS->getType();
14949 if (LHS->getType() != CastTy)
14950 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
14951 if (RHS->getType() != CastTy)
14952 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
14957 V0 = Builder.CreateBinOp(
14959 V1 = Builder.CreateBinOp(
14962 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
14965 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
14967 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->
isIntegerTy()) {
14968 unsigned SrcBWSz =
DL->getTypeSizeInBits(
14970 unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
14971 if (BWSz <= SrcBWSz) {
14972 if (BWSz < SrcBWSz)
14973 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
14974 assert(LHS->getType() == VecTy &&
14975 "Expected same type as operand.");
14978 LHS = FinalShuffle(LHS, E);
14979 E->VectorizedValue =
LHS;
14980 ++NumVectorInstructions;
14984 V0 = Builder.CreateCast(
14986 V1 = Builder.CreateCast(
14991 for (
Value *V : {V0, V1}) {
14993 GatherShuffleExtractSeq.
insert(
I);
14994 CSEBlocks.
insert(
I->getParent());
15003 E->buildAltOpShuffleMask(
15005 assert(E->isOpcodeOrAlt(
I) &&
"Unexpected main/alternate opcode");
15009 Mask, &OpScalars, &AltScalars);
15013 auto DropNuwFlag = [&](
Value *Vec,
unsigned Opcode) {
15016 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
15018 auto *IV = cast<Instruction>(V);
15019 return IV->getOpcode() == Instruction::Sub &&
15020 isCommutative(cast<Instruction>(IV));
15022 I->setHasNoUnsignedWrap(
false);
15024 DropNuwFlag(V0, E->getOpcode());
15025 DropNuwFlag(V1, E->getAltOpcode());
15031 V = Builder.CreateShuffleVector(V0, V1, Mask);
15035 GatherShuffleExtractSeq.
insert(
I);
15036 CSEBlocks.
insert(
I->getParent());
15039 E->VectorizedValue = V;
15040 ++NumVectorInstructions;
15059 for (
auto &BSIter : BlocksSchedules) {
15060 scheduleBlock(BSIter.second.get());
15064 EntryToLastInstruction.
clear();
15067 Builder.SetInsertPoint(ReductionRoot->
getParent(),
15070 Builder.SetInsertPoint(&
F->getEntryBlock(),
F->getEntryBlock().begin());
15074 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15075 if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
15076 TE->Idx >= GatheredLoadsEntriesFirst &&
15077 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
15078 assert((!TE->UserTreeIndices.empty() ||
15079 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
15080 "Expected gathered load node.");
15086 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
15087 if (TE->State == TreeEntry::Vectorize &&
15088 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
15089 TE->VectorizedValue)
15095 for (
const TreeEntry *E : PostponedNodes) {
15096 auto *TE =
const_cast<TreeEntry *
>(
E);
15097 if (
auto *VecTE = getTreeEntry(TE->Scalars.front()))
15098 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
15099 TE->UserTreeIndices.front().EdgeIdx)) &&
15100 VecTE->isSame(TE->Scalars))
15105 TE->VectorizedValue =
nullptr;
15125 if (UI->comesBefore(InsertPt))
15128 Builder.SetInsertPoint(InsertPt);
15130 Builder.SetInsertPoint(PrevVec);
15132 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
15134 if (Vec->
getType() != PrevVec->getType()) {
15136 PrevVec->getType()->isIntOrIntVectorTy() &&
15137 "Expected integer vector types only.");
15138 std::optional<bool> IsSigned;
15139 for (
Value *V : TE->Scalars) {
15140 if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
15141 auto It = MinBWs.find(BaseTE);
15142 if (It != MinBWs.end()) {
15143 IsSigned = IsSigned.value_or(
false) || It->second.second;
15147 for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
15148 auto It = MinBWs.find(MNTE);
15149 if (It != MinBWs.end()) {
15150 IsSigned = IsSigned.value_or(
false) || It->second.second;
15155 if (IsSigned.value_or(
false))
15158 for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
15159 auto It = MinBWs.find(BVE);
15160 if (It != MinBWs.end()) {
15161 IsSigned = IsSigned.value_or(
false) || It->second.second;
15166 if (IsSigned.value_or(
false))
15170 IsSigned.value_or(
false) ||
15174 if (IsSigned.value_or(
false))
15178 if (IsSigned.value_or(
false)) {
15180 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
15181 if (It != MinBWs.end())
15182 IsSigned = It->second.second;
15185 "Expected user node or perfect diamond match in MinBWs.");
15186 Vec = Builder.CreateIntCast(Vec, PrevVec->
getType(), *IsSigned);
15188 PrevVec->replaceAllUsesWith(Vec);
15189 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
15192 auto It = PostponedValues.
find(PrevVec);
15193 if (It != PostponedValues.
end()) {
15194 for (TreeEntry *VTE : It->getSecond())
15195 VTE->VectorizedValue = Vec;
15215 for (
const auto &ExternalUse : ExternalUses) {
15216 Value *Scalar = ExternalUse.Scalar;
15223 TreeEntry *E = getTreeEntry(Scalar);
15224 assert(E &&
"Invalid scalar");
15225 assert(!E->isGather() &&
"Extracting from a gather list");
15227 if (E->getOpcode() == Instruction::GetElementPtr &&
15231 Value *Vec = E->VectorizedValue;
15232 assert(Vec &&
"Can't find vectorizable value");
15234 Value *Lane = Builder.getInt32(ExternalUse.Lane);
15235 auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
15236 if (Scalar->getType() != Vec->
getType()) {
15237 Value *Ex =
nullptr;
15238 Value *ExV =
nullptr;
15240 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.
contains(Inst);
15241 auto It = ScalarToEEs.
find(Scalar);
15242 if (It != ScalarToEEs.
end()) {
15245 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
15246 : Builder.GetInsertBlock());
15247 if (EEIt != It->second.end()) {
15248 Value *PrevV = EEIt->second.first;
15250 I && !ReplaceInst &&
15251 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
15252 Builder.GetInsertPoint()->comesBefore(
I)) {
15253 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
15254 Builder.GetInsertPoint());
15259 ExV = EEIt->second.second ? EEIt->second.second : Ex;
15268 IgnoredExtracts.
insert(EE);
15271 auto *CloneInst = Inst->clone();
15272 CloneInst->insertBefore(Inst);
15273 if (Inst->hasName())
15274 CloneInst->takeName(Inst);
15279 Value *V = ES->getVectorOperand();
15281 if (
const TreeEntry *ETE = getTreeEntry(V))
15282 V = ETE->VectorizedValue;
15284 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
15285 IV->comesBefore(IVec))
15286 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
15288 Ex = Builder.CreateExtractElement(Vec, Lane);
15289 }
else if (
auto *VecTy =
15296 Ex = Builder.CreateExtractVector(
15299 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
15301 Ex = Builder.CreateExtractElement(Vec, Lane);
15306 if (Scalar->getType() != Ex->
getType())
15307 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
15308 MinBWs.find(E)->second.second);
15311 : &
F->getEntryBlock(),
15312 std::make_pair(Ex, ExV));
15318 GatherShuffleExtractSeq.
insert(ExI);
15319 CSEBlocks.
insert(ExI->getParent());
15325 "In-tree scalar of vector type is not insertelement?");
15334 if (!ScalarsWithNullptrUser.
insert(Scalar).second)
15338 ExternalUsesAsOriginalScalar.
contains(Scalar) ||
15341 if (ExternalUsesAsOriginalScalar.contains(U))
15343 TreeEntry *UseEntry = getTreeEntry(U);
15345 (UseEntry->State == TreeEntry::Vectorize ||
15347 TreeEntry::StridedVectorize) &&
15348 (E->State == TreeEntry::Vectorize ||
15349 E->State == TreeEntry::StridedVectorize) &&
15350 doesInTreeUserNeedToExtract(
15351 Scalar, getRootEntryInstruction(*UseEntry),
15354 "Scalar with nullptr User must be registered in "
15355 "ExternallyUsedValues map or remain as scalar in vectorized "
15359 if (
PHI->getParent()->isLandingPad())
15360 Builder.SetInsertPoint(
15363 PHI->getParent()->getLandingPadInst()->getIterator()));
15365 Builder.SetInsertPoint(
PHI->getParent(),
15366 PHI->getParent()->getFirstNonPHIIt());
15368 Builder.SetInsertPoint(VecI->getParent(),
15369 std::next(VecI->getIterator()));
15372 Builder.SetInsertPoint(&
F->getEntryBlock(),
F->getEntryBlock().begin());
15374 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
15376 if (Scalar != NewInst) {
15379 "Extractelements should not be replaced.");
15380 Scalar->replaceAllUsesWith(NewInst);
15390 if (!UsedInserts.
insert(VU).second)
15393 auto BWIt = MinBWs.find(E);
15395 auto *ScalarTy = FTy->getElementType();
15396 auto Key = std::make_pair(Vec, ScalarTy);
15397 auto VecIt = VectorCasts.
find(Key);
15398 if (VecIt == VectorCasts.
end()) {
15401 if (IVec->getParent()->isLandingPad())
15402 Builder.SetInsertPoint(IVec->getParent(),
15403 std::next(IVec->getParent()
15404 ->getLandingPadInst()
15407 Builder.SetInsertPoint(
15408 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
15410 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
15412 Vec = Builder.CreateIntCast(
15417 BWIt->second.second);
15420 Vec = VecIt->second;
15427 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
15434 unsigned Idx = *InsertIdx;
15435 if (It == ShuffledInserts.
end()) {
15437 It = std::next(ShuffledInserts.
begin(),
15438 ShuffledInserts.
size() - 1);
15443 Mask[Idx] = ExternalUse.Lane;
15455 for (
unsigned I :
seq<unsigned>(0, PH->getNumIncomingValues())) {
15456 if (PH->getIncomingValue(
I) == Scalar) {
15458 PH->getIncomingBlock(
I)->getTerminator();
15460 Builder.SetInsertPoint(VecI->getParent(),
15461 std::next(VecI->getIterator()));
15463 Builder.SetInsertPoint(PH->getIncomingBlock(
I)->getTerminator());
15465 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
15466 PH->setOperand(
I, NewInst);
15471 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
15475 Builder.SetInsertPoint(&
F->getEntryBlock(),
F->getEntryBlock().begin());
15476 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
15487 for (
int I = 0, E = Mask.size();
I < E; ++
I) {
15489 CombinedMask1[
I] = Mask[
I];
15491 CombinedMask2[
I] = Mask[
I] - VF;
15495 ShuffleBuilder.
add(V1, CombinedMask1);
15497 ShuffleBuilder.
add(V2, CombinedMask2);
15498 return ShuffleBuilder.
finalize({}, {});
15502 bool ForSingleMask) {
15503 unsigned VF = Mask.size();
15506 if (
any_of(Mask, [VF](
int Idx) {
return Idx >=
static_cast<int>(VF); })) {
15507 Vec = CreateShuffle(Vec,
nullptr, Mask);
15508 return std::make_pair(Vec,
true);
15510 if (!ForSingleMask) {
15512 for (
unsigned I = 0;
I < VF; ++
I) {
15514 ResizeMask[Mask[
I]] = Mask[
I];
15516 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
15520 return std::make_pair(Vec,
false);
15524 for (
int I = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
15529 Builder.SetInsertPoint(LastInsert);
15530 auto Vector = ShuffledInserts[
I].ValueMasks.takeVector();
15535 return cast<VectorType>(Vec->getType())
15536 ->getElementCount()
15537 .getKnownMinValue();
15542 assert((Vals.size() == 1 || Vals.size() == 2) &&
15543 "Expected exactly 1 or 2 input values.");
15544 if (Vals.size() == 1) {
15547 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
15548 ->getNumElements() ||
15549 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
15550 return CreateShuffle(Vals.front(), nullptr, Mask);
15551 return Vals.front();
15553 return CreateShuffle(Vals.
front() ? Vals.
front()
15555 Vals.
back(), Mask);
15557 auto It = ShuffledInserts[
I].InsertElements.
rbegin();
15560 if (It != ShuffledInserts[
I].InsertElements.
rend())
15563 while (It != ShuffledInserts[
I].InsertElements.
rend()) {
15564 assert(
II &&
"Must be an insertelement instruction.");
15572 II->replaceUsesOfWith(
II->getOperand(0), NewInst);
15574 if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
15575 II->moveAfter(NewI);
15578 LastInsert->replaceAllUsesWith(NewInst);
15580 IE->replaceUsesOfWith(IE->getOperand(0),
15582 IE->replaceUsesOfWith(IE->getOperand(1),
15586 CSEBlocks.
insert(LastInsert->getParent());
15591 for (
auto &TEPtr : VectorizableTree) {
15592 TreeEntry *Entry = TEPtr.get();
15595 if (Entry->isGather())
15598 assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
15601 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
15602 Value *Scalar = Entry->Scalars[Lane];
15604 if (Entry->getOpcode() == Instruction::GetElementPtr &&
15608 EE && IgnoredExtracts.contains(EE))
15611 Type *Ty = Scalar->getType();
15612 if (!Ty->isVoidTy()) {
15613 for (
User *U : Scalar->users()) {
15617 assert((getTreeEntry(U) ||
15618 (UserIgnoreList && UserIgnoreList->contains(U)) ||
15621 "Deleting out-of-tree value");
15625 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
15634 V->mergeDIAssignID(RemovedInsts);
15637 if (UserIgnoreList) {
15639 const TreeEntry *
IE = getTreeEntry(
I);
15640 if (
IE->Idx != 0 &&
15642 !
IE->UserTreeIndices.empty() &&
15644 [&](
const EdgeInfo &EI) {
15645 return EI.UserTE == VectorizableTree.front().get() &&
15646 EI.EdgeIdx == UINT_MAX;
15648 !(GatheredLoadsEntriesFirst != NoGatheredLoads &&
15649 IE->Idx >= GatheredLoadsEntriesFirst &&
15650 VectorizableTree.front()->isGather() &&
15656 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
15657 (match(U.getUser(), m_LogicalAnd()) ||
15658 match(U.getUser(), m_LogicalOr())) &&
15659 U.getOperandNo() == 0;
15660 if (IsPoisoningLogicalOp) {
15661 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
15664 return UserIgnoreList->contains(U.getUser());
15678 Builder.ClearInsertionPoint();
15679 InstrElementSize.
clear();
15681 const TreeEntry &RootTE = *VectorizableTree.front();
15682 Value *Vec = RootTE.VectorizedValue;
15683 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
15684 It != MinBWs.end() &&
15685 ReductionBitWidth != It->second.first) {
15687 Builder.SetInsertPoint(ReductionRoot->getParent(),
15688 ReductionRoot->getIterator());
15689 Vec = Builder.CreateIntCast(
15693 It->second.second);
15700 <<
" gather sequences instructions.\n");
15707 Loop *L = LI->getLoopFor(
I->getParent());
15712 BasicBlock *PreHeader = L->getLoopPreheader();
15720 auto *OpI = dyn_cast<Instruction>(V);
15721 return OpI && L->contains(OpI);
15727 CSEBlocks.
insert(PreHeader);
15742 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
15743 "Different nodes should have different DFS numbers");
15744 return A->getDFSNumIn() <
B->getDFSNumIn();
15754 if (I1->getType() != I2->getType())
15759 return I1->isIdenticalTo(I2);
15760 if (SI1->isIdenticalTo(SI2))
15762 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
15763 if (SI1->getOperand(
I) != SI2->getOperand(
I))
15766 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
15770 unsigned LastUndefsCnt = 0;
15771 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
15777 NewMask[
I] != SM1[
I])
15780 NewMask[
I] = SM1[
I];
15784 return SM1.
size() - LastUndefsCnt > 1 &&
15788 SM1.
size() - LastUndefsCnt));
15794 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
15797 "Worklist not sorted properly!");
15804 !GatherShuffleExtractSeq.contains(&In))
15809 bool Replaced =
false;
15812 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
15813 DT->
dominates(V->getParent(), In.getParent())) {
15814 In.replaceAllUsesWith(V);
15817 if (!NewMask.
empty())
15818 SI->setShuffleMask(NewMask);
15823 GatherShuffleExtractSeq.contains(V) &&
15824 IsIdenticalOrLessDefined(V, &In, NewMask) &&
15825 DT->
dominates(In.getParent(), V->getParent())) {
15827 V->replaceAllUsesWith(&In);
15830 if (!NewMask.
empty())
15831 SI->setShuffleMask(NewMask);
15839 Visited.push_back(&In);
15844 GatherShuffleExtractSeq.clear();
15847BoUpSLP::ScheduleData *
15849 ScheduleData *Bundle =
nullptr;
15850 ScheduleData *PrevInBundle =
nullptr;
15851 for (
Value *V : VL) {
15854 ScheduleData *BundleMember = getScheduleData(V);
15856 "no ScheduleData for bundle member "
15857 "(maybe not in same basic block)");
15858 assert(BundleMember->isSchedulingEntity() &&
15859 "bundle member already part of other bundle");
15860 if (PrevInBundle) {
15861 PrevInBundle->NextInBundle = BundleMember;
15863 Bundle = BundleMember;
15867 BundleMember->FirstInBundle = Bundle;
15868 PrevInBundle = BundleMember;
15870 assert(Bundle &&
"Failed to find schedule bundle");
15876std::optional<BoUpSLP::ScheduleData *>
15878 const InstructionsState &S) {
15889 auto TryScheduleBundleImpl = [
this, OldScheduleEnd, SLP](
bool ReSchedule,
15890 ScheduleData *Bundle) {
15896 if (ScheduleEnd != OldScheduleEnd) {
15897 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode())
15898 if (ScheduleData *SD = getScheduleData(
I))
15899 SD->clearDependencies();
15904 <<
" in block " << BB->
getName() <<
"\n");
15905 calculateDependencies(Bundle,
true, SLP);
15910 initialFillReadyList(ReadyInsts);
15917 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
15918 !ReadyInsts.empty()) {
15919 ScheduleData *Picked = ReadyInsts.pop_back_val();
15920 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
15921 "must be ready to schedule");
15922 schedule(Picked, ReadyInsts);
15928 for (
Value *V : VL) {
15931 if (!extendSchedulingRegion(V, S)) {
15938 TryScheduleBundleImpl(
false,
nullptr);
15939 return std::nullopt;
15943 bool ReSchedule =
false;
15944 for (
Value *V : VL) {
15947 ScheduleData *BundleMember = getScheduleData(V);
15949 "no ScheduleData for bundle member (maybe not in same basic block)");
15953 ReadyInsts.remove(BundleMember);
15955 if (!BundleMember->IsScheduled)
15960 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
15961 <<
" was already scheduled\n");
15965 auto *Bundle = buildBundle(VL);
15966 TryScheduleBundleImpl(ReSchedule, Bundle);
15967 if (!Bundle->isReady()) {
15968 cancelScheduling(VL, S.OpValue);
15969 return std::nullopt;
15982 ScheduleData *Bundle = getScheduleData(OpValue);
15983 LLVM_DEBUG(
dbgs() <<
"SLP: cancel scheduling of " << *Bundle <<
"\n");
15984 assert(!Bundle->IsScheduled &&
15985 "Can't cancel bundle which is already scheduled");
15986 assert(Bundle->isSchedulingEntity() &&
15988 "tried to unbundle something which is not a bundle");
15991 if (Bundle->isReady())
15992 ReadyInsts.remove(Bundle);
15995 ScheduleData *BundleMember = Bundle;
15996 while (BundleMember) {
15997 assert(BundleMember->FirstInBundle == Bundle &&
"corrupt bundle links");
15998 BundleMember->FirstInBundle = BundleMember;
15999 ScheduleData *
Next = BundleMember->NextInBundle;
16000 BundleMember->NextInBundle =
nullptr;
16001 BundleMember->TE =
nullptr;
16002 if (BundleMember->unscheduledDepsInBundle() == 0) {
16003 ReadyInsts.insert(BundleMember);
16005 BundleMember =
Next;
16009BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
16011 if (ChunkPos >= ChunkSize) {
16012 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
16015 return &(ScheduleDataChunks.back()[ChunkPos++]);
16018bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
16019 Value *V,
const InstructionsState &S) {
16021 assert(
I &&
"bundle member must be an instruction");
16024 "phi nodes/insertelements/extractelements/extractvalues don't need to "
16026 if (getScheduleData(
I))
16028 if (!ScheduleStart) {
16030 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
16032 ScheduleEnd =
I->getNextNode();
16033 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
16034 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
16042 ++ScheduleStart->getIterator().getReverse();
16048 return II->isAssumeLikeIntrinsic();
16051 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
16052 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
16053 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
16055 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
16056 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
16063 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
16064 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
16066 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
16067 assert(
I->getParent() == ScheduleStart->getParent() &&
16068 "Instruction is in wrong basic block.");
16069 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
16075 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
16076 "Expected to reach top of the basic block or instruction down the "
16078 assert(
I->getParent() == ScheduleEnd->getParent() &&
16079 "Instruction is in wrong basic block.");
16080 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
16082 ScheduleEnd =
I->getNextNode();
16083 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
16084 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
16088void BoUpSLP::BlockScheduling::initScheduleData(
Instruction *FromI,
16090 ScheduleData *PrevLoadStore,
16091 ScheduleData *NextLoadStore) {
16092 ScheduleData *CurrentLoadStore = PrevLoadStore;
16097 ScheduleData *SD = ScheduleDataMap.lookup(
I);
16099 SD = allocateScheduleDataChunks();
16100 ScheduleDataMap[
I] = SD;
16102 assert(!isInSchedulingRegion(SD) &&
16103 "new ScheduleData already in scheduling region");
16104 SD->init(SchedulingRegionID,
I);
16106 if (
I->mayReadOrWriteMemory() &&
16110 Intrinsic::pseudoprobe))) {
16112 if (CurrentLoadStore) {
16113 CurrentLoadStore->NextLoadStore = SD;
16115 FirstLoadStoreInRegion = SD;
16117 CurrentLoadStore = SD;
16122 RegionHasStackSave =
true;
16124 if (NextLoadStore) {
16125 if (CurrentLoadStore)
16126 CurrentLoadStore->NextLoadStore = NextLoadStore;
16128 LastLoadStoreInRegion = CurrentLoadStore;
16132void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
16133 bool InsertInReadyList,
16135 assert(SD->isSchedulingEntity());
16140 while (!WorkList.
empty()) {
16142 for (ScheduleData *BundleMember = SD; BundleMember;
16143 BundleMember = BundleMember->NextInBundle) {
16144 assert(isInSchedulingRegion(BundleMember));
16145 if (BundleMember->hasValidDependencies())
16150 BundleMember->Dependencies = 0;
16151 BundleMember->resetUnscheduledDeps();
16154 for (
User *U : BundleMember->Inst->users()) {
16156 BundleMember->Dependencies++;
16157 ScheduleData *DestBundle = UseSD->FirstInBundle;
16158 if (!DestBundle->IsScheduled)
16159 BundleMember->incrementUnscheduledDeps(1);
16160 if (!DestBundle->hasValidDependencies())
16166 auto *DepDest = getScheduleData(
I);
16167 assert(DepDest &&
"must be in schedule window");
16168 DepDest->ControlDependencies.push_back(BundleMember);
16169 BundleMember->Dependencies++;
16170 ScheduleData *DestBundle = DepDest->FirstInBundle;
16171 if (!DestBundle->IsScheduled)
16172 BundleMember->incrementUnscheduledDeps(1);
16173 if (!DestBundle->hasValidDependencies())
16181 for (
Instruction *
I = BundleMember->Inst->getNextNode();
16182 I != ScheduleEnd;
I =
I->getNextNode()) {
16187 MakeControlDependent(
I);
16195 if (RegionHasStackSave) {
16201 for (
Instruction *
I = BundleMember->Inst->getNextNode();
16202 I != ScheduleEnd;
I =
I->getNextNode()) {
16213 MakeControlDependent(
I);
16223 BundleMember->Inst->mayReadOrWriteMemory()) {
16224 for (
Instruction *
I = BundleMember->Inst->getNextNode();
16225 I != ScheduleEnd;
I =
I->getNextNode()) {
16231 MakeControlDependent(
I);
16238 ScheduleData *DepDest = BundleMember->NextLoadStore;
16243 "NextLoadStore list for non memory effecting bundle?");
16245 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
16246 unsigned NumAliased = 0;
16247 unsigned DistToSrc = 1;
16249 for (; DepDest; DepDest = DepDest->NextLoadStore) {
16250 assert(isInSchedulingRegion(DepDest));
16260 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
16262 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
16269 DepDest->MemoryDependencies.push_back(BundleMember);
16270 BundleMember->Dependencies++;
16271 ScheduleData *DestBundle = DepDest->FirstInBundle;
16272 if (!DestBundle->IsScheduled) {
16273 BundleMember->incrementUnscheduledDeps(1);
16275 if (!DestBundle->hasValidDependencies()) {
16298 if (InsertInReadyList && SD->isReady()) {
16299 ReadyInsts.insert(SD);
16306void BoUpSLP::BlockScheduling::resetSchedule() {
16308 "tried to reset schedule on block which has not been scheduled");
16310 if (ScheduleData *SD = getScheduleData(
I)) {
16311 assert(isInSchedulingRegion(SD) &&
16312 "ScheduleData not in scheduling region");
16313 SD->IsScheduled =
false;
16314 SD->resetUnscheduledDeps();
16317 ReadyInsts.clear();
16320void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
16321 if (!BS->ScheduleStart)
16324 LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
16331 BS->resetSchedule();
16338 struct ScheduleDataCompare {
16339 bool operator()(ScheduleData *SD1, ScheduleData *SD2)
const {
16340 return SD2->SchedulingPriority < SD1->SchedulingPriority;
16343 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
16348 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
16349 I =
I->getNextNode()) {
16350 if (ScheduleData *SD = BS->getScheduleData(
I)) {
16351 TreeEntry *SDTE = getTreeEntry(SD->Inst);
16354 SD->isPartOfBundle() ==
16356 "scheduler and vectorizer bundle mismatch");
16357 SD->FirstInBundle->SchedulingPriority = Idx++;
16359 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
16360 BS->calculateDependencies(SD,
false,
this);
16363 BS->initialFillReadyList(ReadyInsts);
16365 Instruction *LastScheduledInst = BS->ScheduleEnd;
16368 while (!ReadyInsts.empty()) {
16369 ScheduleData *Picked = *ReadyInsts.begin();
16370 ReadyInsts.erase(ReadyInsts.begin());
16374 for (ScheduleData *BundleMember = Picked; BundleMember;
16375 BundleMember = BundleMember->NextInBundle) {
16379 LastScheduledInst = PickedInst;
16382 BS->schedule(Picked, ReadyInsts);
16386#ifdef EXPENSIVE_CHECKS
16390#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
16392 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
I =
I->getNextNode()) {
16393 ScheduleData *SD = BS->getScheduleData(
I);
16394 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
16395 assert(SD->IsScheduled &&
"must be scheduled at this point");
16400 BS->ScheduleStart =
nullptr;
16408 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
16413 auto E = InstrElementSize.
find(V);
16414 if (E != InstrElementSize.
end())
16431 Value *FirstNonBool =
nullptr;
16432 while (!Worklist.
empty()) {
16437 auto *Ty =
I->getType();
16440 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
16448 Width = std::max<unsigned>(Width,
DL->getTypeSizeInBits(Ty));
16456 for (
Use &U :
I->operands()) {
16458 if (Visited.
insert(J).second &&
16464 FirstNonBool = U.get();
16475 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
16477 Width =
DL->getTypeSizeInBits(V->getType());
16481 InstrElementSize[
I] = Width;
16486bool BoUpSLP::collectValuesToDemote(
16487 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
16489 unsigned &MaxDepthLevel,
bool &IsProfitableToDemote,
16490 bool IsTruncRoot)
const {
16495 unsigned OrigBitWidth =
16496 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
16505 bool IsSignedNode =
any_of(E.Scalars, [&](
Value *R) {
16506 return !isKnownNonNegative(R, SimplifyQuery(*DL));
16508 auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
16515 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >
BitWidth) {
16521 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
16526 unsigned BitWidth2 =
16527 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
16528 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
16534 BitWidth1 = std::min(BitWidth1, BitWidth2);
16539 using namespace std::placeholders;
16540 auto FinalAnalysis = [&]() {
16541 if (!IsProfitableToDemote)
16544 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
16546 if (Res && E.isGather()) {
16550 for (
Value *V : E.Scalars) {
16554 UniqueBases.
insert(EE->getVectorOperand());
16556 const unsigned VF = E.Scalars.size();
16557 Type *OrigScalarTy = E.Scalars.front()->getType();
16558 if (UniqueBases.
size() <= 2 ||
16566 if (E.isGather() || !Visited.
insert(&E).second ||
16568 return all_of(V->users(), [&](User *U) {
16569 return isa<InsertElementInst>(U) && !getTreeEntry(U);
16572 return FinalAnalysis();
16575 return !all_of(V->users(), [=](User *U) {
16576 return getTreeEntry(U) ||
16577 (E.Idx == 0 && UserIgnoreList &&
16578 UserIgnoreList->contains(U)) ||
16579 (!isa<CmpInst>(U) && U->getType()->isSized() &&
16580 !U->getType()->isScalableTy() &&
16581 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
16582 }) && !IsPotentiallyTruncated(V,
BitWidth);
16587 bool &NeedToExit) {
16588 NeedToExit =
false;
16589 unsigned InitLevel = MaxDepthLevel;
16591 unsigned Level = InitLevel;
16592 if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
16593 ToDemote, Visited, Level, IsProfitableToDemote,
16595 if (!IsProfitableToDemote)
16598 if (!FinalAnalysis())
16602 MaxDepthLevel = std::max(MaxDepthLevel, Level);
16606 auto AttemptCheckBitwidth =
16609 NeedToExit =
false;
16610 unsigned BestFailBitwidth = 0;
16612 if (Checker(
BitWidth, OrigBitWidth))
16614 if (BestFailBitwidth == 0 && FinalAnalysis())
16618 if (BestFailBitwidth == 0) {
16629 auto TryProcessInstruction =
16635 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
16640 if (E.UserTreeIndices.size() > 1 &&
16641 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
16644 bool NeedToExit =
false;
16645 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
16649 if (!ProcessOperands(
Operands, NeedToExit))
16658 return IsProfitableToDemote;
16660 switch (E.getOpcode()) {
16664 case Instruction::Trunc:
16665 if (IsProfitableToDemoteRoot)
16666 IsProfitableToDemote =
true;
16667 return TryProcessInstruction(
BitWidth);
16668 case Instruction::ZExt:
16669 case Instruction::SExt:
16670 IsProfitableToDemote =
true;
16671 return TryProcessInstruction(
BitWidth);
16675 case Instruction::Add:
16676 case Instruction::Sub:
16677 case Instruction::Mul:
16678 case Instruction::And:
16679 case Instruction::Or:
16680 case Instruction::Xor: {
16681 return TryProcessInstruction(
16682 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
16684 case Instruction::Shl: {
16689 auto *I = cast<Instruction>(V);
16690 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
16691 return AmtKnownBits.getMaxValue().ult(BitWidth);
16694 return TryProcessInstruction(
16695 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
16697 case Instruction::LShr: {
16701 auto LShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
16703 auto *I = cast<Instruction>(V);
16704 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
16705 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
16706 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
16707 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
16708 SimplifyQuery(*DL));
16711 return TryProcessInstruction(
16712 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
16715 case Instruction::AShr: {
16719 auto AShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
16721 auto *I = cast<Instruction>(V);
16722 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
16723 unsigned ShiftedBits = OrigBitWidth - BitWidth;
16724 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
16725 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
16729 return TryProcessInstruction(
16730 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
16733 case Instruction::UDiv:
16734 case Instruction::URem: {
16736 auto Checker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
16739 auto *I = cast<Instruction>(V);
16740 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
16741 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
16742 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
16745 return TryProcessInstruction(
16746 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
16750 case Instruction::Select: {
16751 return TryProcessInstruction(
16752 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
16757 case Instruction::PHI: {
16758 const unsigned NumOps = E.getNumOperands();
16761 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
16763 return TryProcessInstruction(
BitWidth, Ops);
16766 case Instruction::Call: {
16771 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
16772 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
16776 auto CompChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
16779 auto *I = cast<Instruction>(V);
16780 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
16781 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
16782 return MaskedValueIsZero(I->getOperand(0), Mask,
16783 SimplifyQuery(*DL)) &&
16784 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
16786 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
16787 "Expected min/max intrinsics only.");
16788 unsigned SignBits = OrigBitWidth -
BitWidth;
16794 return SignBits <= Op0SignBits &&
16795 ((SignBits != Op0SignBits &&
16799 SignBits <= Op1SignBits &&
16800 ((SignBits != Op1SignBits &&
16805 if (ID != Intrinsic::abs) {
16806 Operands.push_back(getOperandEntry(&E, 1));
16807 CallChecker = CompChecker;
16810 std::numeric_limits<InstructionCost::CostType>::max();
16812 unsigned VF = E.Scalars.size();
16820 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
16821 if (Cost < BestCost) {
16827 [[maybe_unused]]
bool NeedToExit;
16828 (void)AttemptCheckBitwidth(Checker, NeedToExit);
16838 return FinalAnalysis();
16845 bool IsStoreOrInsertElt =
16846 VectorizableTree.front()->getOpcode() == Instruction::Store ||
16847 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
16848 if ((IsStoreOrInsertElt || UserIgnoreList) &&
16849 ExtraBitWidthNodes.
size() <= 1 &&
16850 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
16851 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
16854 unsigned NodeIdx = 0;
16855 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
16859 if (VectorizableTree[NodeIdx]->
isGather() ||
16860 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.
empty()) ||
16861 (NodeIdx != 0 &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
16864 static_cast<int>(NodeIdx);
16870 bool IsTruncRoot =
false;
16871 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
16873 if (NodeIdx != 0 &&
16874 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
16875 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
16876 assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
16877 IsTruncRoot =
true;
16879 IsProfitableToDemoteRoot =
true;
16884 if (AnalyzedMinBWVals.
contains(VectorizableTree[NodeIdx]->Scalars.front()))
16888 auto ComputeMaxBitWidth = [&](
const TreeEntry &
E,
bool IsTopRoot,
16889 bool IsProfitableToDemoteRoot,
unsigned Opcode,
16890 unsigned Limit,
bool IsTruncRoot,
16891 bool IsSignedCmp) ->
unsigned {
16895 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
16896 E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
16898 return V->hasOneUse() || isa<Constant>(V) ||
16899 (!V->hasNUsesOrMore(UsesLimit) &&
16900 none_of(V->users(), [&](User *U) {
16901 const TreeEntry *TE = getTreeEntry(U);
16902 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
16903 if (TE == UserTE || !TE)
16905 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
16907 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
16908 SelectInst>(UserTE->getMainOp()))
16910 unsigned UserTESz = DL->getTypeSizeInBits(
16911 UserTE->Scalars.front()->getType());
16912 auto It = MinBWs.find(TE);
16913 if (It != MinBWs.end() && It->second.first > UserTESz)
16915 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
16919 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
16920 auto It = MinBWs.find(UserTE);
16921 if (It != MinBWs.end())
16922 return It->second.first;
16923 unsigned MaxBitWidth =
16924 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
16925 MaxBitWidth =
bit_ceil(MaxBitWidth);
16926 if (MaxBitWidth < 8 && MaxBitWidth > 1)
16928 return MaxBitWidth;
16931 unsigned VF = E.getVectorFactor();
16932 Type *ScalarTy = E.Scalars.front()->getType();
16935 if (!TreeRootIT || !Opcode)
16939 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
16948 unsigned MaxBitWidth = 1u;
16956 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
16957 KnownBits Known = computeKnownBits(R, *DL);
16958 return Known.isNonNegative();
16963 for (
Value *Root : E.Scalars) {
16967 unsigned BitWidth1 = NumTypeBits - NumSignBits;
16983 if (!IsKnownPositive)
16987 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
16989 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
16992 if (MaxBitWidth < 8 && MaxBitWidth > 1)
16997 if (NumParts > 1 &&
17003 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
17004 Opcode == Instruction::SExt ||
17005 Opcode == Instruction::ZExt || NumParts > 1;
17010 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
17011 bool NeedToDemote = IsProfitableToDemote;
17013 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
17014 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
17016 (MaxDepthLevel <= Limit &&
17017 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
17018 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
17019 DL->getTypeSizeInBits(TreeRootIT) /
17026 MaxBitWidth =
bit_ceil(MaxBitWidth);
17028 return MaxBitWidth;
17035 if (UserIgnoreList &&
17037 for (
Value *V : *UserIgnoreList) {
17039 auto NumTypeBits =
DL->getTypeSizeInBits(V->getType());
17040 unsigned BitWidth1 = NumTypeBits - NumSignBits;
17043 unsigned BitWidth2 = BitWidth1;
17046 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
17048 ReductionBitWidth =
17049 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
17051 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
17052 ReductionBitWidth = 8;
17054 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
17056 bool IsTopRoot = NodeIdx == 0;
17057 while (NodeIdx < VectorizableTree.size() &&
17058 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
17059 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
17060 RootDemotes.push_back(NodeIdx);
17062 IsTruncRoot =
true;
17064 bool IsSignedCmp =
false;
17065 while (NodeIdx < VectorizableTree.size()) {
17067 unsigned Limit = 2;
17068 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
17070 ReductionBitWidth ==
17071 DL->getTypeSizeInBits(
17072 VectorizableTree.front()->Scalars.front()->getType()))
17074 unsigned MaxBitWidth = ComputeMaxBitWidth(
17075 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
17076 Limit, IsTruncRoot, IsSignedCmp);
17077 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
17078 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
17079 ReductionBitWidth =
bit_ceil(MaxBitWidth);
17080 else if (MaxBitWidth == 0)
17081 ReductionBitWidth = 0;
17084 for (
unsigned Idx : RootDemotes) {
17085 if (
all_of(VectorizableTree[Idx]->Scalars, [&](
Value *V) {
17087 DL->getTypeSizeInBits(V->getType()->getScalarType());
17088 if (OrigBitWidth > MaxBitWidth) {
17096 RootDemotes.clear();
17098 IsProfitableToDemoteRoot =
true;
17100 if (ExtraBitWidthNodes.
empty()) {
17101 NodeIdx = VectorizableTree.size();
17103 unsigned NewIdx = 0;
17105 NewIdx = *ExtraBitWidthNodes.
begin();
17106 ExtraBitWidthNodes.
erase(ExtraBitWidthNodes.
begin());
17107 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.
empty());
17110 NodeIdx < VectorizableTree.size() &&
17111 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
17112 [](
const EdgeInfo &EI) {
17113 return EI.EdgeIdx == 0 &&
17114 EI.UserTE->getOpcode() == Instruction::Trunc &&
17115 !EI.UserTE->isAltShuffle();
17118 NodeIdx < VectorizableTree.size() &&
17119 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
17120 [&](
const EdgeInfo &EI) {
17121 return EI.UserTE->getOpcode() == Instruction::ICmp &&
17123 auto *IC = dyn_cast<ICmpInst>(V);
17126 !isKnownNonNegative(IC->getOperand(0),
17127 SimplifyQuery(*DL)) ||
17128 !isKnownNonNegative(IC->getOperand(1),
17129 SimplifyQuery(*DL)));
17136 if (MaxBitWidth == 0 ||
17140 if (UserIgnoreList)
17147 for (
unsigned Idx : ToDemote) {
17148 TreeEntry *
TE = VectorizableTree[Idx].get();
17149 if (MinBWs.contains(TE))
17152 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17154 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
17195 DL = &
F.getDataLayout();
17205 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
17210 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
17213 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
17217 BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
17226 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
17228 R.clearReductionData();
17229 collectSeedInstructions(BB);
17232 if (!Stores.empty()) {
17234 <<
" underlying objects.\n");
17235 Changed |= vectorizeStoreChains(R);
17239 Changed |= vectorizeChainsInBlock(BB, R);
17244 if (!GEPs.
empty()) {
17246 <<
" underlying objects.\n");
17247 Changed |= vectorizeGEPIndices(BB, R);
17252 R.optimizeGatherSequence();
17260 unsigned Idx,
unsigned MinVF,
17265 const unsigned Sz = R.getVectorElementSize(Chain[0]);
17266 unsigned VF = Chain.
size();
17276 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << VF <<
" stores at offset " << Idx
17280 for (
Value *V : Chain)
17289 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
17293 return !isa<ExtractElementInst>(V) &&
17294 (V->getNumUses() > Chain.size() ||
17295 any_of(V->users(), [&](User *U) {
17296 return !Stores.contains(U);
17299 (ValOps.
size() > Chain.size() / 2 && !S.getOpcode())) {
17300 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
17304 if (
R.isLoadCombineCandidate(Chain))
17306 R.buildTree(Chain);
17308 if (
R.isTreeTinyAndNotFullyVectorizable()) {
17309 if (
R.isGathered(Chain.front()) ||
17311 return std::nullopt;
17312 Size =
R.getCanonicalGraphSize();
17315 R.reorderTopToBottom();
17316 R.reorderBottomToTop();
17317 R.transformNodes();
17318 R.buildExternalUses();
17320 R.computeMinimumValueSizes();
17322 Size =
R.getCanonicalGraphSize();
17323 if (S.getOpcode() == Instruction::Load)
17327 LLVM_DEBUG(
dbgs() <<
"SLP: Found cost = " << Cost <<
" for VF=" << VF <<
"\n");
17329 LLVM_DEBUG(
dbgs() <<
"SLP: Decided to vectorize cost = " << Cost <<
"\n");
17331 using namespace ore;
17335 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost", Cost)
17336 <<
" and with tree size "
17337 <<
NV(
"TreeSize",
R.getTreeSize()));
17351 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
17352 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
17353 unsigned Size = First ? Val.first : Val.second;
17365 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
17366 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
17367 unsigned P = First ? Val.first : Val.second;
17370 return V + (P - Mean) * (P - Mean);
17373 return Dev * 81 / (Mean * Mean) == 0;
17376bool SLPVectorizerPass::vectorizeStores(
17378 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
17385 struct StoreDistCompare {
17386 bool operator()(
const std::pair<unsigned, int> &Op1,
17387 const std::pair<unsigned, int> &Op2)
const {
17388 return Op1.second < Op2.second;
17393 using StoreIndexToDistSet =
17394 std::set<std::pair<unsigned, int>, StoreDistCompare>;
17395 auto TryToVectorize = [&](
const StoreIndexToDistSet &
Set) {
17400 if (
Operands.empty() ||
Data.second - PrevDist == 1) {
17402 PrevDist =
Data.second;
17403 if (Idx !=
Set.size() - 1)
17408 Operands.push_back(Stores[DataVar.first]);
17409 PrevDist = DataVar.second;
17414 .
insert({Operands.front(),
17415 cast<StoreInst>(Operands.front())->getValueOperand(),
17417 cast<StoreInst>(Operands.back())->getValueOperand(),
17422 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
17423 unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
17427 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
17428 unsigned MaxRegVF = MaxVF;
17430 Type *StoreTy =
Store->getValueOperand()->getType();
17431 Type *ValueTy = StoreTy;
17433 ValueTy = Trunc->getSrcTy();
17434 if (ValueTy == StoreTy &&
17435 R.getVectorElementSize(
Store->getValueOperand()) <= EltSize)
17437 unsigned MinVF = std::max<unsigned>(
17439 R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
17442 if (MaxVF < MinVF) {
17443 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
17445 <<
"MinVF (" << MinVF <<
")\n");
17449 unsigned NonPowerOf2VF = 0;
17455 std::clamp<unsigned>(
Operands.size(), MaxVF, MaxRegVF);
17457 NonPowerOf2VF = CandVF;
17462 unsigned Size = MinVF;
17464 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
17468 unsigned Repeat = 0;
17469 constexpr unsigned MaxAttempts = 4;
17471 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &
P) {
17472 P.first =
P.second = 1;
17475 auto IsNotVectorized = [](
bool First,
17476 const std::pair<unsigned, unsigned> &
P) {
17477 return First ?
P.first > 0 :
P.second > 0;
17479 auto IsVectorized = [](
bool First,
17480 const std::pair<unsigned, unsigned> &
P) {
17481 return First ?
P.first == 0 :
P.second == 0;
17483 auto VFIsProfitable = [](
bool First,
unsigned Size,
17484 const std::pair<unsigned, unsigned> &
P) {
17487 auto FirstSizeSame = [](
unsigned Size,
17488 const std::pair<unsigned, unsigned> &
P) {
17489 return Size ==
P.first;
17493 bool RepeatChanged =
false;
17494 bool AnyProfitableGraph =
false;
17495 for (
unsigned Size : CandidateVFs) {
17496 AnyProfitableGraph =
false;
17497 unsigned StartIdx = std::distance(
17498 RangeSizes.begin(),
17499 find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
17500 std::placeholders::_1)));
17501 while (StartIdx <
End) {
17503 std::distance(RangeSizes.begin(),
17504 find_if(RangeSizes.drop_front(StartIdx),
17505 std::bind(IsVectorized,
Size >= MaxRegVF,
17506 std::placeholders::_1)));
17507 unsigned Sz = EndIdx >=
End ?
End : EndIdx;
17508 for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
17510 Size >= MaxRegVF)) {
17518 ->getValueOperand()
17521 ->getValueOperand()
17524 "Expected all operands of same type.");
17525 if (!NonSchedulable.empty()) {
17526 auto [NonSchedSizeMax, NonSchedSizeMin] =
17527 NonSchedulable.lookup(Slice.front());
17528 if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
17529 Cnt += NonSchedSizeMax;
17534 std::optional<bool> Res =
17535 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
17538 .try_emplace(Slice.front(), std::make_pair(
Size,
Size))
17539 .first->getSecond()
17544 VectorizedStores.
insert(Slice.begin(), Slice.end());
17547 AnyProfitableGraph = RepeatChanged =
Changed =
true;
17551 [](std::pair<unsigned, unsigned> &
P) {
17552 P.first = P.second = 0;
17554 if (Cnt < StartIdx + MinVF) {
17555 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
17556 [](std::pair<unsigned, unsigned> &
P) {
17557 P.first = P.second = 0;
17559 StartIdx = Cnt +
Size;
17561 if (Cnt > Sz -
Size - MinVF) {
17563 [](std::pair<unsigned, unsigned> &
P) {
17564 P.first = P.second = 0;
17573 if (
Size > 2 && Res &&
17575 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
17576 std::placeholders::_1))) {
17582 if (
Size > MaxRegVF && TreeSize > 1 &&
17584 std::bind(FirstSizeSame, TreeSize,
17585 std::placeholders::_1))) {
17587 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
17593 [&](std::pair<unsigned, unsigned> &
P) {
17594 if (Size >= MaxRegVF)
17595 P.second = std::max(P.second, TreeSize);
17597 P.first = std::max(P.first, TreeSize);
17600 AnyProfitableGraph =
true;
17602 if (StartIdx >=
End)
17604 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
17605 AnyProfitableGraph =
true;
17606 StartIdx = std::distance(
17607 RangeSizes.begin(),
17608 find_if(RangeSizes.drop_front(Sz),
17609 std::bind(IsNotVectorized,
Size >= MaxRegVF,
17610 std::placeholders::_1)));
17612 if (!AnyProfitableGraph &&
Size >= MaxRegVF)
17616 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
17617 return P.first == 0 &&
P.second == 0;
17621 if (Repeat >= MaxAttempts ||
17622 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
17624 constexpr unsigned StoresLimit = 64;
17625 const unsigned MaxTotalNum =
bit_floor(std::min<unsigned>(
17627 static_cast<unsigned>(
17630 RangeSizes.begin(),
17631 find_if(RangeSizes, std::bind(IsNotVectorized,
true,
17632 std::placeholders::_1))) +
17635 if (VF > MaxTotalNum || VF >= StoresLimit)
17637 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
17639 P.first = std::max(
P.second,
P.first);
17643 CandidateVFs.clear();
17644 CandidateVFs.push_back(VF);
17690 auto FillStoresSet = [&](
unsigned Idx,
StoreInst *
SI) {
17691 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
17693 Stores[
Set.first]->getValueOperand()->getType(),
17694 Stores[
Set.first]->getPointerOperand(),
17695 SI->getValueOperand()->getType(),
SI->getPointerOperand(), *DL, *SE,
17699 auto It =
Set.second.find(std::make_pair(Idx, *Diff));
17700 if (It ==
Set.second.end()) {
17701 Set.second.emplace(Idx, *Diff);
17705 TryToVectorize(
Set.second);
17706 StoreIndexToDistSet PrevSet;
17707 PrevSet.swap(
Set.second);
17709 Set.second.emplace(Idx, 0);
17712 unsigned StartIdx = It->first + 1;
17717 for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
17719 if (Pair.first <= It->first ||
17720 VectorizedStores.
contains(Stores[Pair.first]))
17722 unsigned BI = Pair.first - StartIdx;
17723 UsedStores.set(BI);
17724 Dists[BI] = Pair.second - It->second;
17726 for (
unsigned I = StartIdx;
I < Idx; ++
I) {
17727 unsigned BI =
I - StartIdx;
17728 if (UsedStores.test(BI))
17729 Set.second.emplace(
I, Dists[BI]);
17733 auto &Res = SortedStores.emplace_back();
17735 Res.second.emplace(Idx, 0);
17737 Type *PrevValTy =
nullptr;
17739 if (
R.isDeleted(SI))
17742 PrevValTy =
SI->getValueOperand()->getType();
17744 if (PrevValTy !=
SI->getValueOperand()->getType()) {
17745 for (
auto &Set : SortedStores)
17746 TryToVectorize(
Set.second);
17747 SortedStores.clear();
17748 PrevValTy =
SI->getValueOperand()->getType();
17750 FillStoresSet(
I, SI);
17754 for (
auto &Set : SortedStores)
17755 TryToVectorize(
Set.second);
17760void SLPVectorizerPass::collectSeedInstructions(
BasicBlock *BB) {
17772 if (!
SI->isSimple())
17783 if (
GEP->getNumIndices() != 1)
17785 Value *Idx =
GEP->idx_begin()->get();
17790 if (
GEP->getType()->isVectorTy())
17802 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
17803 << VL.
size() <<
".\n");
17808 if (!S.getOpcode())
17814 for (
Value *V : VL) {
17815 Type *Ty = V->getType();
17819 R.getORE()->emit([&]() {
17820 std::string TypeStr;
17824 <<
"Cannot SLP vectorize list: type "
17825 << TypeStr +
" is unsupported by vectorizer";
17831 unsigned Sz =
R.getVectorElementSize(I0);
17832 unsigned MinVF =
R.getMinVF(Sz);
17833 unsigned MaxVF = std::max<unsigned>(
llvm::bit_floor(VL.size()), MinVF);
17834 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
17836 R.getORE()->emit([&]() {
17838 <<
"Cannot SLP vectorize list: vectorization factor "
17839 <<
"less than 2 is not supported";
17845 bool CandidateFound =
false;
17849 unsigned NextInst = 0, MaxInst = VL.size();
17850 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
17857 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
17858 unsigned ActualVF = std::min(MaxInst -
I, VF);
17863 if (MaxVFOnly && ActualVF < MaxVF)
17865 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
17872 return I &&
R.isDeleted(
I);
17876 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
17880 if (
R.isTreeTinyAndNotFullyVectorizable())
17882 R.reorderTopToBottom();
17883 R.reorderBottomToTop(
17885 !
R.doesRootHaveInTreeUses());
17886 R.transformNodes();
17887 R.buildExternalUses();
17889 R.computeMinimumValueSizes();
17891 CandidateFound =
true;
17892 MinCost = std::min(MinCost, Cost);
17895 <<
" for VF=" << ActualVF <<
"\n");
17897 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing list at cost:" << Cost <<
".\n");
17900 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost", Cost)
17901 <<
" and with tree size "
17902 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
17913 if (!
Changed && CandidateFound) {
17914 R.getORE()->emit([&]() {
17916 <<
"List vectorization was possible but not beneficial with cost "
17917 <<
ore::NV(
"Cost", MinCost) <<
" >= "
17921 R.getORE()->emit([&]() {
17923 <<
"Cannot SLP vectorize list: vectorization was impossible"
17924 <<
" with available vectorization factors";
17942 if (!Op0 || !Op1 || Op0->getParent() !=
P || Op1->getParent() !=
P)
17952 if (
A &&
B &&
B->hasOneUse()) {
17955 if (B0 && B0->getParent() ==
P)
17957 if (B1 && B1->getParent() ==
P)
17961 if (
B &&
A &&
A->hasOneUse()) {
17964 if (A0 && A0->getParent() ==
P)
17966 if (A1 && A1->getParent() ==
P)
17970 if (Candidates.
size() == 1)
17971 return tryToVectorizeList({Op0, Op1},
R);
17974 std::optional<int> BestCandidate =
R.findBestRootPair(Candidates);
17975 if (!BestCandidate)
17977 return tryToVectorizeList(
17978 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
R);
18009class HorizontalReduction {
18012 ReductionOpsListType ReductionOps;
18022 bool IsSupportedHorRdxIdentityOp =
false;
18039 if (Kind == RecurKind::None)
18047 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
18051 return I->getFastMathFlags().noNaNs();
18054 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
18057 return I->isAssociative();
18066 return I->getOperand(2);
18067 return I->getOperand(
Index);
18075 case RecurKind::Or:
18078 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS,
Name);
18081 case RecurKind::And:
18084 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(),
Name);
18087 case RecurKind::Add:
18088 case RecurKind::Mul:
18089 case RecurKind::Xor:
18090 case RecurKind::FAdd:
18091 case RecurKind::FMul:
18094 case RecurKind::FMax:
18095 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
18096 case RecurKind::FMin:
18097 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
18098 case RecurKind::FMaximum:
18099 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
18100 case RecurKind::FMinimum:
18101 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
18102 case RecurKind::SMax:
18105 return Builder.CreateSelect(Cmp, LHS, RHS,
Name);
18107 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
18108 case RecurKind::SMin:
18111 return Builder.CreateSelect(Cmp, LHS, RHS,
Name);
18113 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
18114 case RecurKind::UMax:
18117 return Builder.CreateSelect(Cmp, LHS, RHS,
Name);
18119 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
18120 case RecurKind::UMin:
18123 return Builder.CreateSelect(Cmp, LHS, RHS,
Name);
18125 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
18135 const ReductionOpsListType &ReductionOps) {
18136 bool UseSelect = ReductionOps.size() == 2 ||
18138 (ReductionOps.size() == 1 &&
18140 assert((!UseSelect || ReductionOps.size() != 2 ||
18142 "Expected cmp + select pairs for reduction");
18143 Value *
Op = createOp(Builder, RdxKind, LHS, RHS,
Name, UseSelect);
18161 return RecurKind::None;
18163 return RecurKind::Add;
18165 return RecurKind::Mul;
18168 return RecurKind::And;
18171 return RecurKind::Or;
18173 return RecurKind::Xor;
18175 return RecurKind::FAdd;
18177 return RecurKind::FMul;
18180 return RecurKind::FMax;
18182 return RecurKind::FMin;
18185 return RecurKind::FMaximum;
18187 return RecurKind::FMinimum;
18193 return RecurKind::SMax;
18195 return RecurKind::SMin;
18197 return RecurKind::UMax;
18199 return RecurKind::UMin;
18225 return RecurKind::None;
18229 return RecurKind::None;
18232 return RecurKind::None;
18236 return RecurKind::None;
18241 return RecurKind::None;
18244 return RecurKind::SMax;
18247 return RecurKind::SMin;
18250 return RecurKind::UMax;
18253 return RecurKind::UMin;
18256 return RecurKind::None;
18260 static unsigned getFirstOperandIndex(
Instruction *
I) {
18261 return isCmpSelMinMax(
I) ? 1 : 0;
18267 return isCmpSelMinMax(
I) ? 3 : 2;
18273 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
18276 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
18278 return I->getParent() == BB;
18282 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax,
Instruction *
I) {
18283 if (IsCmpSelMinMax) {
18287 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
18288 return I->hasNUses(2);
18297 if (isCmpSelMinMax(
I))
18298 ReductionOps.assign(2, ReductionOpsType());
18300 ReductionOps.assign(1, ReductionOpsType());
18305 if (isCmpSelMinMax(
I)) {
18307 ReductionOps[1].emplace_back(
I);
18309 ReductionOps[0].emplace_back(
I);
18314 int Sz =
Data.size();
18321 HorizontalReduction() =
default;
18327 RdxKind = HorizontalReduction::getRdxKind(Root);
18328 if (!isVectorizable(RdxKind, Root))
18340 if (!Sel->getCondition()->hasOneUse())
18343 ReductionRoot = Root;
18348 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
18350 1, std::make_pair(Root, 0));
18359 getNumberOfOperands(TreeN)))) {
18360 Value *EdgeVal = getRdxOperand(TreeN,
I);
18361 ReducedValsToOps[EdgeVal].push_back(TreeN);
18369 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
18370 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
18371 !isVectorizable(RdxKind, EdgeInst) ||
18372 (
R.isAnalyzedReductionRoot(EdgeInst) &&
18374 PossibleReducedVals.push_back(EdgeVal);
18377 ReductionOps.push_back(EdgeInst);
18388 PossibleReducedVals;
18389 initReductionOps(Root);
18393 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
18397 auto LIt = LoadsMap.
find(
Ptr);
18398 if (LIt != LoadsMap.
end()) {
18399 for (
LoadInst *RLI : LIt->second) {
18405 for (
LoadInst *RLI : LIt->second) {
18412 if (LIt->second.size() > 2) {
18414 hash_value(LIt->second.back()->getPointerOperand());
18419 LoadKeyUsed.
insert(Key);
18424 while (!Worklist.empty()) {
18425 auto [TreeN, Level] = Worklist.pop_back_val();
18428 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
18429 addReductionOps(TreeN);
18432 for (
Value *V : PossibleRedVals) {
18436 ++PossibleReducedVals[
Key][Idx]
18437 .
insert(std::make_pair(V, 0))
18441 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
18443 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
18446 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
18447 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
18449 for (
auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
18452 auto RedValsVect = It->second.takeVector();
18454 for (
const std::pair<Value *, unsigned> &
Data : RedValsVect)
18455 PossibleRedValsVect.
back().append(
Data.second,
Data.first);
18457 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
18458 return P1.size() > P2.size();
18463 (!isGoodForReduction(
Data) &&
18471 NewIdx = ReducedVals.
size();
18489 constexpr unsigned RegMaxNumber = 4;
18490 constexpr unsigned RedValsMaxNumber = 128;
18494 if (
unsigned NumReducedVals = std::accumulate(
18495 ReducedVals.
begin(), ReducedVals.
end(), 0,
18497 if (!isGoodForReduction(Vals))
18499 return Num + Vals.size();
18501 NumReducedVals < ReductionLimit &&
18505 for (ReductionOpsType &RdxOps : ReductionOps)
18506 for (
Value *RdxOp : RdxOps)
18518 ReducedVals.
front().size());
18522 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
18524 "Expected min/max reduction to have select root instruction");
18527 "Expected min/max reduction to have compare condition");
18532 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
18533 if (VectorizedTree) {
18535 Builder.SetCurrentDebugLocation(
18540 auto It = ReducedValsToOps.
find(Res);
18541 if (It != ReducedValsToOps.
end() &&
18547 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
18553 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
18554 return isBoolLogicOp(cast<Instruction>(V));
18557 ReductionOps.front().size());
18558 for (ReductionOpsType &RdxOps : ReductionOps)
18559 for (
Value *RdxOp : RdxOps) {
18562 IgnoreList.insert(RdxOp);
18567 for (
Value *U : IgnoreList)
18569 RdxFMF &= FPMO->getFastMathFlags();
18575 for (
Value *V : Candidates)
18576 TrackedVals.try_emplace(V, V);
18579 Value *V) ->
unsigned & {
18580 auto *It = MV.
find(V);
18581 assert(It != MV.
end() &&
"Unable to find given key.");
18589 Value *VectorizedTree =
nullptr;
18590 bool CheckForReusedReductionOps =
false;
18595 for (
unsigned I = 0, E = ReducedVals.
size();
I < E; ++
I) {
18597 InstructionsState S = States[
I];
18601 for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
18602 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
18609 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
18610 (S.getOpcode() && !Inst))
18613 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
18615 bool ShuffledExtracts =
false;
18617 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
18620 for (
Value *RV : ReducedVals[
I + 1]) {
18621 Value *RdxVal = TrackedVals.at(RV);
18628 CommonCandidates.push_back(RdxVal);
18629 TrackedToOrig.try_emplace(RdxVal, RV);
18634 Candidates.
swap(CommonCandidates);
18635 ShuffledExtracts =
true;
18642 Value *OrigV = TrackedToOrig.at(Candidates.
front());
18643 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
18645 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
18646 Value *OrigV = TrackedToOrig.at(VC);
18647 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
18649 V.analyzedReductionRoot(ResI);
18651 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
18655 unsigned NumReducedVals = Candidates.
size();
18656 if (NumReducedVals < ReductionLimit &&
18657 (NumReducedVals < 2 || !
isSplat(Candidates)))
18662 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
18663 RdxKind != RecurKind::FMul &&
18664 RdxKind != RecurKind::FMulAdd;
18667 if (IsSupportedHorRdxIdentityOp)
18668 for (
Value *V : Candidates) {
18669 Value *OrigV = TrackedToOrig.at(V);
18670 ++SameValuesCounter.
try_emplace(OrigV).first->second;
18682 bool SameScaleFactor =
false;
18683 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
18684 SameValuesCounter.
size() != Candidates.size();
18685 if (OptReusedScalars) {
18687 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
18688 RdxKind == RecurKind::Xor) &&
18690 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
18691 return P.second == SameValuesCounter.
front().second;
18693 Candidates.resize(SameValuesCounter.
size());
18694 transform(SameValuesCounter, Candidates.begin(),
18695 [&](
const auto &
P) { return TrackedVals.at(P.first); });
18696 NumReducedVals = Candidates.size();
18698 if (NumReducedVals == 1) {
18699 Value *OrigV = TrackedToOrig.at(Candidates.front());
18700 unsigned Cnt = At(SameValuesCounter, OrigV);
18702 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
18703 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
18704 VectorizedVals.try_emplace(OrigV, Cnt);
18709 unsigned MaxVecRegSize = V.getMaxVecRegSize();
18710 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
18711 const unsigned MaxElts = std::clamp<unsigned>(
18713 RegMaxNumber * RedValsMaxNumber);
18715 unsigned ReduxWidth = NumReducedVals;
18718 ReduxWidth = std::min(ReduxWidth, MaxElts);
18720 unsigned Start = 0;
18721 unsigned Pos = Start;
18723 unsigned PrevReduxWidth = ReduxWidth;
18724 bool CheckForReusedReductionOpsLocal =
false;
18725 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
18726 &CheckForReusedReductionOpsLocal,
18727 &PrevReduxWidth, &V,
18728 &IgnoreList](
bool IgnoreVL =
false) {
18729 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
18730 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
18733 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
18736 if (Pos < NumReducedVals - ReduxWidth + 1)
18737 return IsAnyRedOpGathered;
18739 ReduxWidth =
bit_ceil(ReduxWidth) / 2;
18740 return IsAnyRedOpGathered;
18742 bool AnyVectorized =
false;
18743 while (Pos < NumReducedVals - ReduxWidth + 1 &&
18744 ReduxWidth >= ReductionLimit) {
18747 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
18749 CheckForReusedReductionOps =
true;
18752 PrevReduxWidth = ReduxWidth;
18755 if (V.areAnalyzedReductionVals(VL)) {
18756 (void)AdjustReducedVals(
true);
18765 return V.isDeleted(RedValI);
18768 V.buildTree(VL, IgnoreList);
18769 if (V.isTreeTinyAndNotFullyVectorizable(
true)) {
18770 if (!AdjustReducedVals())
18771 V.analyzedReductionVals(VL);
18774 if (V.isLoadCombineReductionCandidate(RdxKind)) {
18775 if (!AdjustReducedVals())
18776 V.analyzedReductionVals(VL);
18779 V.reorderTopToBottom();
18781 V.reorderBottomToTop(
true);
18788 LocalExternallyUsedValues[ReductionRoot];
18789 for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
18790 if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
18792 for (
Value *V : ReducedVals[Cnt])
18794 LocalExternallyUsedValues[TrackedVals[V]];
18796 if (!IsSupportedHorRdxIdentityOp) {
18799 "Reused values counter map is not empty");
18800 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
18801 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
18803 Value *V = Candidates[Cnt];
18804 Value *OrigV = TrackedToOrig.at(V);
18805 ++SameValuesCounter.
try_emplace(OrigV).first->second;
18811 for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
18812 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
18814 Value *RdxVal = Candidates[Cnt];
18815 if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
18816 RdxVal = It->second;
18817 if (!Visited.
insert(RdxVal).second)
18821 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
18822 LocalExternallyUsedValues[RdxVal];
18825 Value *OrigV = TrackedToOrig.at(RdxVal);
18827 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
18828 if (NumOps != ReducedValsToOps.
at(OrigV).size())
18829 LocalExternallyUsedValues[RdxVal];
18832 if (!IsSupportedHorRdxIdentityOp)
18833 SameValuesCounter.
clear();
18834 for (
Value *RdxVal : VL)
18835 if (RequiredExtract.
contains(RdxVal))
18836 LocalExternallyUsedValues[RdxVal];
18837 V.transformNodes();
18838 V.buildExternalUses(LocalExternallyUsedValues);
18840 V.computeMinimumValueSizes();
18845 getReductionCost(
TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
18848 <<
" for reduction\n");
18849 if (!Cost.isValid())
18852 V.getORE()->emit([&]() {
18854 ReducedValsToOps.
at(VL[0]).front())
18855 <<
"Vectorizing horizontal reduction is possible "
18856 <<
"but not beneficial with cost " <<
ore::NV(
"Cost", Cost)
18857 <<
" and threshold "
18860 if (!AdjustReducedVals())
18861 V.analyzedReductionVals(VL);
18865 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:"
18866 << Cost <<
". (HorRdx)\n");
18867 V.getORE()->emit([&]() {
18869 ReducedValsToOps.
at(VL[0]).front())
18870 <<
"Vectorized horizontal reduction with cost "
18871 <<
ore::NV(
"Cost", Cost) <<
" and with tree size "
18872 <<
ore::NV(
"TreeSize", V.getTreeSize());
18875 Builder.setFastMathFlags(RdxFMF);
18881 if (IsCmpSelMinMax)
18882 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
18885 Value *VectorizedRoot =
18886 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
18889 for (
Value *RdxVal : Candidates) {
18890 Value *OrigVal = TrackedToOrig.at(RdxVal);
18891 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
18892 if (TransformedRdxVal != RdxVal)
18893 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
18896 Builder.SetInsertPoint(InsertPt);
18901 if ((isBoolLogicOp(RdxRootInst) ||
18902 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
18904 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
18907 if (OptReusedScalars && !SameScaleFactor) {
18908 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
18909 SameValuesCounter, TrackedToOrig);
18912 Value *ReducedSubTree;
18913 Type *ScalarTy = VL.front()->getType();
18934 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
18935 ReducedSubTree = Builder.CreateInsertElement(
18936 ReducedSubTree, emitReduction(Lane, Builder,
TTI),
I);
18939 ReducedSubTree = emitReduction(VectorizedRoot, Builder,
TTI);
18941 if (ReducedSubTree->
getType() != VL.front()->getType()) {
18942 assert(ReducedSubTree->
getType() != VL.front()->getType() &&
18943 "Expected different reduction type.");
18945 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
18946 V.isSignedMinBitwidthRootNode());
18952 if (OptReusedScalars && SameScaleFactor)
18953 ReducedSubTree = emitScaleForReusedOps(
18954 ReducedSubTree, Builder, SameValuesCounter.
front().second);
18956 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
18958 for (
Value *RdxVal : VL) {
18959 Value *OrigV = TrackedToOrig.at(RdxVal);
18960 if (IsSupportedHorRdxIdentityOp) {
18961 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
18964 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
18965 if (!V.isVectorized(RdxVal))
18966 RequiredExtract.
insert(RdxVal);
18971 AnyVectorized =
true;
18973 if (OptReusedScalars && !AnyVectorized) {
18974 for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
18975 Value *RdxVal = TrackedVals.at(
P.first);
18976 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,
P.second);
18977 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
18978 VectorizedVals.try_emplace(
P.first,
P.second);
18983 if (VectorizedTree) {
19004 if (!AnyBoolLogicOp)
19006 if (isBoolLogicOp(RedOp1) &&
19007 ((!InitStep && LHS == VectorizedTree) ||
19010 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
19011 getRdxOperand(RedOp2, 0) == RHS ||
19016 if (LHS != VectorizedTree)
19017 LHS = Builder.CreateFreeze(LHS);
19027 unsigned Sz = InstVals.
size();
19030 for (
unsigned I = 0, E = (Sz / 2) * 2;
I <
E;
I += 2) {
19032 Builder.SetCurrentDebugLocation(RedOp->
getDebugLoc());
19033 Value *RdxVal1 = InstVals[
I].second;
19034 Value *StableRdxVal1 = RdxVal1;
19035 auto It1 = TrackedVals.find(RdxVal1);
19036 if (It1 != TrackedVals.end())
19037 StableRdxVal1 = It1->second;
19038 Value *RdxVal2 = InstVals[
I + 1].second;
19039 Value *StableRdxVal2 = RdxVal2;
19040 auto It2 = TrackedVals.find(RdxVal2);
19041 if (It2 != TrackedVals.end())
19042 StableRdxVal2 = It2->second;
19046 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
19048 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
19049 StableRdxVal2,
"op.rdx", ReductionOps);
19050 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
19053 ExtraReds[Sz / 2] = InstVals.
back();
19061 for (
Value *RdxVal : Candidates) {
19062 if (!Visited.
insert(RdxVal).second)
19064 unsigned NumOps = VectorizedVals.lookup(RdxVal);
19066 ArrayRef(ReducedValsToOps.
at(RdxVal)).drop_back(NumOps))
19071 bool InitStep =
true;
19072 while (ExtraReductions.
size() > 1) {
19074 FinalGen(ExtraReductions, InitStep);
19075 ExtraReductions.
swap(NewReds);
19078 VectorizedTree = ExtraReductions.
front().second;
19080 ReductionRoot->replaceAllUsesWith(VectorizedTree);
19089 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
19096 for (
auto *U :
Ignore->users()) {
19098 "All users must be either in the reduction ops list.");
19101 if (!
Ignore->use_empty()) {
19103 Ignore->replaceAllUsesWith(
P);
19106 V.removeInstructionsAndOperands(RdxOps);
19108 }
else if (!CheckForReusedReductionOps) {
19109 for (ReductionOpsType &RdxOps : ReductionOps)
19110 for (
Value *RdxOp : RdxOps)
19113 return VectorizedTree;
19120 bool IsCmpSelMinMax,
unsigned ReduxWidth,
19123 Type *ScalarTy = ReducedVals.
front()->getType();
19132 int Cnt = ReducedVals.
size();
19133 for (
Value *RdxVal : ReducedVals) {
19138 Cost += GenCostFn();
19144 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
19152 Cost += ScalarCost;
19154 Cost += GenCostFn();
19159 case RecurKind::Add:
19160 case RecurKind::Mul:
19161 case RecurKind::Or:
19162 case RecurKind::And:
19163 case RecurKind::Xor:
19164 case RecurKind::FAdd:
19165 case RecurKind::FMul: {
19186 ScalarCost = EvaluateScalarCost([&]() {
19191 case RecurKind::FMax:
19192 case RecurKind::FMin:
19193 case RecurKind::FMaximum:
19194 case RecurKind::FMinimum:
19195 case RecurKind::SMax:
19196 case RecurKind::SMin:
19197 case RecurKind::UMax:
19198 case RecurKind::UMin: {
19202 ScalarCost = EvaluateScalarCost([&]() {
19212 LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
19214 <<
" (It is a splitting reduction)\n");
19215 return VectorCost - ScalarCost;
19221 assert(VectorizedValue &&
"Need to have a vectorized tree node");
19222 assert(RdxKind != RecurKind::FMulAdd &&
19223 "A call to the llvm.fmuladd intrinsic is not handled yet");
19225 ++NumVectorInstructions;
19232 assert(IsSupportedHorRdxIdentityOp &&
19233 "The optimization of matched scalar identity horizontal reductions "
19234 "must be supported.");
19236 return VectorizedValue;
19238 case RecurKind::Add: {
19240 Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
19242 << VectorizedValue <<
". (HorRdx)\n");
19243 return Builder.CreateMul(VectorizedValue, Scale);
19245 case RecurKind::Xor: {
19247 LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
19248 <<
". (HorRdx)\n");
19251 return VectorizedValue;
19253 case RecurKind::FAdd: {
19255 Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
19257 << VectorizedValue <<
". (HorRdx)\n");
19258 return Builder.CreateFMul(VectorizedValue, Scale);
19260 case RecurKind::And:
19261 case RecurKind::Or:
19262 case RecurKind::SMax:
19263 case RecurKind::SMin:
19264 case RecurKind::UMax:
19265 case RecurKind::UMin:
19266 case RecurKind::FMax:
19267 case RecurKind::FMin:
19268 case RecurKind::FMaximum:
19269 case RecurKind::FMinimum:
19271 return VectorizedValue;
19272 case RecurKind::Mul:
19273 case RecurKind::FMul:
19274 case RecurKind::FMulAdd:
19275 case RecurKind::IAnyOf:
19276 case RecurKind::FAnyOf:
19277 case RecurKind::None:
19289 assert(IsSupportedHorRdxIdentityOp &&
19290 "The optimization of matched scalar identity horizontal reductions "
19291 "must be supported.");
19294 if (VTy->getElementType() != VL.
front()->getType()) {
19295 VectorizedValue = Builder.CreateIntCast(
19298 R.isSignedMinBitwidthRootNode());
19301 case RecurKind::Add: {
19304 for (
Value *V : VL) {
19305 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
19306 Vals.
push_back(ConstantInt::get(V->getType(), Cnt,
false));
19310 << VectorizedValue <<
". (HorRdx)\n");
19311 return Builder.CreateMul(VectorizedValue, Scale);
19313 case RecurKind::And:
19314 case RecurKind::Or:
19317 <<
". (HorRdx)\n");
19318 return VectorizedValue;
19319 case RecurKind::SMax:
19320 case RecurKind::SMin:
19321 case RecurKind::UMax:
19322 case RecurKind::UMin:
19323 case RecurKind::FMax:
19324 case RecurKind::FMin:
19325 case RecurKind::FMaximum:
19326 case RecurKind::FMinimum:
19329 <<
". (HorRdx)\n");
19330 return VectorizedValue;
19331 case RecurKind::Xor: {
19339 std::iota(
Mask.begin(),
Mask.end(), 0);
19340 bool NeedShuffle =
false;
19341 for (
unsigned I = 0, VF = VL.size();
I < VF; ++
I) {
19343 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
19344 if (Cnt % 2 == 0) {
19346 NeedShuffle =
true;
19352 dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
19354 VectorizedValue = Builder.CreateShuffleVector(
19356 ConstantVector::getNullValue(VectorizedValue->
getType()), Mask);
19357 return VectorizedValue;
19359 case RecurKind::FAdd: {
19362 for (
Value *V : VL) {
19363 unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
19364 Vals.
push_back(ConstantFP::get(V->getType(), Cnt));
19367 return Builder.CreateFMul(VectorizedValue, Scale);
19369 case RecurKind::Mul:
19370 case RecurKind::FMul:
19371 case RecurKind::FMulAdd:
19372 case RecurKind::IAnyOf:
19373 case RecurKind::FAnyOf:
19374 case RecurKind::None:
19384 return HorizontalReduction::getRdxKind(V);
19390 unsigned AggregateSize = 1;
19392 Type *CurrentType =
IV->getType();
19395 for (
auto *Elt : ST->elements())
19396 if (Elt != ST->getElementType(0))
19397 return std::nullopt;
19398 AggregateSize *= ST->getNumElements();
19399 CurrentType = ST->getElementType(0);
19401 AggregateSize *= AT->getNumElements();
19402 CurrentType = AT->getElementType();
19404 AggregateSize *= VT->getNumElements();
19405 return AggregateSize;
19407 return AggregateSize;
19409 return std::nullopt;
19418 unsigned OperandOffset) {
19421 std::optional<unsigned> OperandIndex =
19427 BuildVectorOpds, InsertElts, *OperandIndex);
19430 BuildVectorOpds[*OperandIndex] = InsertedOperand;
19431 InsertElts[*OperandIndex] = LastInsertInst;
19434 }
while (LastInsertInst !=
nullptr &&
19460 "Expected insertelement or insertvalue instruction!");
19463 "Expected empty result vectors!");
19466 if (!AggregateSize)
19468 BuildVectorOpds.
resize(*AggregateSize);
19469 InsertElts.
resize(*AggregateSize);
19474 if (BuildVectorOpds.
size() >= 2)
19492 auto DominatedReduxValue = [&](
Value *R) {
19500 if (
P->getIncomingBlock(0) == ParentBB) {
19502 }
else if (
P->getIncomingBlock(1) == ParentBB) {
19506 if (Rdx && DominatedReduxValue(Rdx))
19519 if (
P->getIncomingBlock(0) == BBLatch) {
19521 }
else if (
P->getIncomingBlock(1) == BBLatch) {
19525 if (Rdx && DominatedReduxValue(Rdx))
19561 "Expected binop, select, or intrinsic for reduction matching");
19563 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
19565 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
19576 Value *Op0 =
nullptr;
19577 Value *Op1 =
nullptr;
19586 Value *B0 =
nullptr, *B1 =
nullptr;
19591bool SLPVectorizerPass::vectorizeHorReduction(
19602 auto SelectRoot = [&]() {
19621 std::queue<std::pair<Instruction *, unsigned>>
Stack;
19622 Stack.emplace(SelectRoot(), 0);
19626 if (
R.isAnalyzedReductionRoot(Inst))
19630 HorizontalReduction HorRdx;
19631 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
19633 return HorRdx.tryToReduce(R, *DL,
TTI, *TLI);
19635 auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
19636 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
19648 while (!
Stack.empty()) {
19651 std::tie(Inst, Level) =
Stack.front();
19656 if (
R.isDeleted(Inst))
19658 if (
Value *VectorizedV = TryToReduce(Inst)) {
19662 Stack.emplace(
I, Level);
19665 if (
R.isDeleted(Inst))
19669 if (!TryAppendToPostponedInsts(Inst)) {
19680 if (VisitedInstrs.
insert(
Op).second)
19685 !
R.isDeleted(
I) &&
I->getParent() == BB)
19686 Stack.emplace(
I, Level);
19695 bool Res = vectorizeHorReduction(
P, Root, BB, R,
TTI, PostponedInsts);
19696 Res |= tryToVectorize(PostponedInsts, R);
19703 for (
Value *V : Insts)
19705 Res |= tryToVectorize(Inst, R);
19709bool SLPVectorizerPass::vectorizeInsertValueInst(
InsertValueInst *IVI,
19712 if (!
R.canMapToVector(IVI->
getType()))
19720 if (MaxVFOnly && BuildVectorOpds.
size() == 2) {
19721 R.getORE()->emit([&]() {
19723 <<
"Cannot SLP vectorize list: only 2 elements of buildvalue, "
19724 "trying reduction first.";
19728 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IVI <<
"\n");
19730 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
19744 if (MaxVFOnly && BuildVectorInsts.
size() == 2) {
19745 R.getORE()->emit([&]() {
19747 <<
"Cannot SLP vectorize list: only 2 elements of buildvector, "
19748 "trying reduction first.";
19752 LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IEI <<
"\n");
19753 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
19756template <
typename T>
19761 bool MaxVFOnly,
BoUpSLP &R) {
19774 if (!
I || R.isDeleted(
I)) {
19778 auto *SameTypeIt = IncIt;
19781 AreCompatible(*SameTypeIt, *IncIt))) {
19784 if (
I && !R.isDeleted(
I))
19789 unsigned NumElts = VL.
size();
19790 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize starting at nodes ("
19791 << NumElts <<
")\n");
19801 if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(VL), MaxVFOnly)) {
19804 VL.
swap(Candidates);
19805 Candidates.
clear();
19813 auto GetMinNumElements = [&R](
Value *V) {
19814 unsigned EltSize = R.getVectorElementSize(V);
19815 return std::max(2U, R.getMaxVecRegSize() / EltSize);
19817 if (NumElts < GetMinNumElements(*IncIt) &&
19818 (Candidates.
empty() ||
19819 Candidates.
front()->getType() == (*IncIt)->getType())) {
19827 if (Candidates.
size() > 1 &&
19828 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
19829 if (TryToVectorizeHelper(Candidates,
false)) {
19832 }
else if (MaxVFOnly) {
19835 for (
auto *It = Candidates.
begin(), *
End = Candidates.
end(); It !=
End;
19838 if (!
I || R.isDeleted(
I)) {
19842 auto *SameTypeIt = It;
19843 while (SameTypeIt !=
End &&
19846 AreCompatible(*SameTypeIt, *It))) {
19849 if (
I && !R.isDeleted(
I))
19852 unsigned NumElts = VL.
size();
19853 if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(VL),
19859 Candidates.
clear();
19863 IncIt = SameTypeIt;
19875template <
bool IsCompatibility>
19880 "Expected valid element types only.");
19882 return IsCompatibility;
19885 if (CI1->getOperand(0)->getType()->getTypeID() <
19887 return !IsCompatibility;
19888 if (CI1->getOperand(0)->getType()->getTypeID() >
19891 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
19893 return !IsCompatibility;
19894 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
19903 if (BasePred1 < BasePred2)
19904 return !IsCompatibility;
19905 if (BasePred1 > BasePred2)
19908 bool CI1Preds = Pred1 == BasePred1;
19909 bool CI2Preds = Pred2 == BasePred1;
19910 for (
int I = 0, E = CI1->getNumOperands();
I < E; ++
I) {
19911 auto *Op1 = CI1->getOperand(CI1Preds ?
I : E -
I - 1);
19912 auto *Op2 = CI2->
getOperand(CI2Preds ?
I : E -
I - 1);
19916 return !IsCompatibility;
19921 if (IsCompatibility) {
19922 if (I1->getParent() != I2->getParent())
19929 return NodeI2 !=
nullptr;
19932 assert((NodeI1 == NodeI2) ==
19934 "Different nodes should have different DFS numbers");
19935 if (NodeI1 != NodeI2)
19939 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
19941 if (IsCompatibility)
19943 if (I1->getOpcode() != I2->getOpcode())
19944 return I1->getOpcode() < I2->getOpcode();
19947 return IsCompatibility;
19950template <
typename ItT>
19956 if (
R.isDeleted(
I))
19960 Changed |= vectorizeRootInstruction(
nullptr, RootOp, BB, R,
TTI);
19964 if (
R.isDeleted(
I))
19976 auto AreCompatibleCompares = [&](
Value *V1,
Value *
V2) {
19986 if (Vals.
size() <= 1)
19989 Vals, CompareSorter, AreCompatibleCompares,
19992 bool ArePossiblyReducedInOtherBlock =
any_of(Candidates, [](
Value *V) {
19994 auto *Select = dyn_cast<SelectInst>(U);
19996 Select->getParent() != cast<Instruction>(V)->getParent();
19999 if (ArePossiblyReducedInOtherBlock)
20001 return tryToVectorizeList(Candidates, R, MaxVFOnly);
20007bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
20010 "This function only accepts Insert instructions");
20011 bool OpsChanged =
false;
20013 for (
auto *
I :
reverse(Instructions)) {
20019 vectorizeInsertValueInst(LastInsertValue, BB, R,
true);
20022 vectorizeInsertElementInst(LastInsertElem, BB, R,
true);
20025 if (
R.isDeleted(
I))
20027 OpsChanged |= vectorizeHorReduction(
nullptr,
I, BB, R,
TTI, PostponedInsts);
20033 vectorizeInsertValueInst(LastInsertValue, BB, R,
false);
20035 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
20040 OpsChanged |= tryToVectorize(PostponedInsts, R);
20054 auto PHICompare = [
this, &PHIToOpcodes](
Value *V1,
Value *
V2) {
20057 "Expected vectorizable types only.");
20065 V2->getType()->getScalarSizeInBits())
20068 V2->getType()->getScalarSizeInBits())
20072 if (Opcodes1.
size() < Opcodes2.
size())
20074 if (Opcodes1.
size() > Opcodes2.
size())
20076 for (
int I = 0, E = Opcodes1.
size();
I < E; ++
I) {
20085 return NodeI2 !=
nullptr;
20088 assert((NodeI1 == NodeI2) ==
20090 "Different nodes should have different DFS numbers");
20091 if (NodeI1 != NodeI2)
20094 if (S.getOpcode() && !S.isAltShuffle())
20096 return I1->getOpcode() < I2->getOpcode();
20119 auto ValID1 = Opcodes1[
I]->getValueID();
20120 auto ValID2 = Opcodes2[
I]->getValueID();
20121 if (ValID1 == ValID2)
20123 if (ValID1 < ValID2)
20125 if (ValID1 > ValID2)
20134 assert(U1 && U2 &&
"The only thing left should be undef & undef.");
20138 auto AreCompatiblePHIs = [&PHIToOpcodes,
this, &
R](
Value *V1,
Value *
V2) {
20141 if (V1->getType() !=
V2->getType())
20145 if (Opcodes1.
size() != Opcodes2.
size())
20147 for (
int I = 0, E = Opcodes1.
size();
I < E; ++
I) {
20153 if (
R.isDeleted(I1) ||
R.isDeleted(I2))
20155 if (
I1->getParent() != I2->getParent())
20164 if (Opcodes1[
I]->getValueID() != Opcodes2[
I]->getValueID())
20170 bool HaveVectorizedPhiNodes =
false;
20181 if (!VisitedInstrs.
count(
P) && !
R.isDeleted(
P) &&
20194 if (!Opcodes.
empty())
20198 while (!Nodes.
empty()) {
20202 for (
Value *V :
PHI->incoming_values()) {
20204 Nodes.push_back(PHI1);
20213 Incoming, PHICompare, AreCompatiblePHIs,
20215 return tryToVectorizeList(Candidates, R, MaxVFOnly);
20218 Changed |= HaveVectorizedPhiNodes;
20219 if (HaveVectorizedPhiNodes &&
any_of(PHIToOpcodes, [&](
const auto &
P) {
20221 return !
PHI ||
R.isDeleted(
PHI);
20223 PHIToOpcodes.
clear();
20225 }
while (HaveVectorizedPhiNodes);
20227 VisitedInstrs.
clear();
20229 InstSetVector PostProcessInserts;
20233 auto VectorizeInsertsAndCmps = [&](
bool VectorizeCmps) {
20234 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
20235 if (VectorizeCmps) {
20237 PostProcessCmps.
clear();
20239 PostProcessInserts.clear();
20245 return PostProcessCmps.
contains(Cmp);
20247 PostProcessInserts.contains(
I);
20253 return I->use_empty() &&
20263 if (
R.isDeleted(&*It))
20266 if (!VisitedInstrs.
insert(&*It).second) {
20267 if (HasNoUsers(&*It) &&
20268 VectorizeInsertsAndCmps(It->isTerminator())) {
20284 if (
P->getNumIncomingValues() == 2) {
20287 if (Root && vectorizeRootInstruction(
P, Root, BB, R,
TTI)) {
20301 if (BB ==
P->getIncomingBlock(
I) ||
20308 PI && !IsInPostProcessInstrs(PI)) {
20309 bool Res = vectorizeRootInstruction(
nullptr, PI,
20310 P->getIncomingBlock(
I), R,
TTI);
20312 if (Res &&
R.isDeleted(
P)) {
20322 if (HasNoUsers(&*It)) {
20323 bool OpsChanged =
false;
20334 TryToVectorizeRoot |= (
I == Stores.
end() ||
I->second.size() == 1) &&
20335 SI->getValueOperand()->hasOneUse();
20337 if (TryToVectorizeRoot) {
20338 for (
auto *V : It->operand_values()) {
20342 VI && !IsInPostProcessInstrs(VI))
20344 OpsChanged |= vectorizeRootInstruction(
nullptr, VI, BB, R,
TTI);
20351 VectorizeInsertsAndCmps(It->isTerminator());
20363 PostProcessInserts.insert(&*It);
20373 for (
auto &Entry : GEPs) {
20376 if (Entry.second.size() < 2)
20379 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length "
20380 << Entry.second.size() <<
".\n");
20388 return !R.isDeleted(GEP);
20390 if (It == Entry.second.end())
20392 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
20393 unsigned EltSize =
R.getVectorElementSize(*(*It)->idx_begin());
20394 if (MaxVecRegSize < EltSize)
20397 unsigned MaxElts = MaxVecRegSize / EltSize;
20398 for (
unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
20399 auto Len = std::min<unsigned>(BE - BI, MaxElts);
20412 Candidates.remove_if([&R](
Value *
I) {
20422 for (
int I = 0, E = GEPList.size();
I < E && Candidates.
size() > 1; ++
I) {
20423 auto *GEPI = GEPList[
I];
20424 if (!Candidates.count(GEPI))
20427 for (
int J =
I + 1; J < E && Candidates.
size() > 1; ++J) {
20428 auto *GEPJ = GEPList[J];
20431 Candidates.remove(GEPI);
20432 Candidates.remove(GEPJ);
20433 }
else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
20434 Candidates.remove(GEPJ);
20441 if (Candidates.
size() < 2)
20448 auto BundleIndex = 0
u;
20449 for (
auto *V : Candidates) {
20451 auto *GEPIdx =
GEP->idx_begin()->get();
20453 Bundle[BundleIndex++] = GEPIdx;
20465 Changed |= tryToVectorizeList(Bundle, R);
20471bool SLPVectorizerPass::vectorizeStoreChains(
BoUpSLP &R) {
20477 if (V->getValueOperand()->getType()->getTypeID() <
20478 V2->getValueOperand()->getType()->getTypeID())
20480 if (V->getValueOperand()->getType()->getTypeID() >
20481 V2->getValueOperand()->getType()->getTypeID())
20483 if (V->getPointerOperandType()->getTypeID() <
20484 V2->getPointerOperandType()->getTypeID())
20486 if (V->getPointerOperandType()->getTypeID() >
20487 V2->getPointerOperandType()->getTypeID())
20489 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
20490 V2->getValueOperand()->getType()->getScalarSizeInBits())
20492 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
20493 V2->getValueOperand()->getType()->getScalarSizeInBits())
20504 DT->
getNode(I2->getParent());
20505 assert(NodeI1 &&
"Should only process reachable instructions");
20506 assert(NodeI2 &&
"Should only process reachable instructions");
20507 assert((NodeI1 == NodeI2) ==
20509 "Different nodes should have different DFS numbers");
20510 if (NodeI1 != NodeI2)
20515 return I1->getOpcode() < I2->getOpcode();
20520 return V->getValueOperand()->getValueID() <
20521 V2->getValueOperand()->getValueID();
20537 if (
I1->getParent() != I2->getParent())
20540 return S.getOpcode() > 0;
20546 V2->getValueOperand()->getValueID();
20551 for (
auto &Pair : Stores) {
20552 if (Pair.second.size() < 2)
20556 << Pair.second.size() <<
".\n");
20565 Pair.second.rend());
20567 ReversedStores, StoreSorter, AreCompatibleStores,
20569 return vectorizeStores(Candidates, R, Attempted);
aarch64 AArch64 CCMP Pass
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Loop::LoopBounds::Direction Direction
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
return ToRemove size() > 0
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
const MachineOperand & RHS
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Type * getValueType(T *V)
Returns the type of the given value/instruction V.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find a subvector of loads and builds a new vector of only loads if it can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned a special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::reverse_iterator reverse_iterator
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
iterator_range< bundle_op_iterator > bundle_op_infos()
Return the range [bundle_op_info_begin, bundle_op_info_end).
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isSafeToRemove() const LLVM_READONLY
Return true if the instruction can be removed if the result is unused.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
size_type count(const KeyT &Key) const
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
typename SuperClass::iterator iterator
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
const Value * stripInBoundsConstantOffsets() const
Strip off pointer casts and all-constant inbounds GEPs.
bool hasOneUse() const
Return true if there is exactly one use of this value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
Instruction & front() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
auto cast_or_null(const Y &Val)
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
auto dyn_cast_or_null(const Y &Val)
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const